inflaton committed
Commit d6b30d7 · 1 Parent(s): a991cdc

glm-4 checkpoints

This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50)
  1. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/added_tokens.json +0 -8
  2. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/config.json +0 -37
  3. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/configuration_internlm2.py +0 -180
  4. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/generation_config.json +0 -9
  5. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00001-of-00008.safetensors +0 -3
  6. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00005-of-00008.safetensors +0 -3
  7. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00006-of-00008.safetensors +0 -3
  8. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00007-of-00008.safetensors +0 -3
  9. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00008-of-00008.safetensors +0 -3
  10. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model.safetensors.index.json +0 -234
  11. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/modeling_internlm2.py +0 -1800
  12. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/tokenization_internlm2.py +0 -236
  13. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/tokenization_internlm2_fast.py +0 -214
  14. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/tokenizer.json +0 -0
  15. llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/tokenizer_config.json +0 -1640
  16. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/README.md +70 -0
  17. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/adapter_config.json +31 -0
  18. llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00002-of-00008.safetensors → saves/glm-4-9b/lora/sft_bf16_p1_full/adapter_model.safetensors} +2 -2
  19. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/added_tokens.json +16 -0
  20. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/all_results.json +13 -0
  21. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/README.md +202 -0
  22. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/adapter_config.json +31 -0
  23. llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00003-of-00008.safetensors → saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/adapter_model.safetensors} +2 -2
  24. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/added_tokens.json +16 -0
  25. llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00004-of-00008.safetensors → saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/optimizer.pt} +2 -2
  26. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/rng_state.pth +3 -0
  27. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/scheduler.pt +3 -0
  28. llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full → saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350}/special_tokens_map.json +16 -22
  29. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/tokenization_chatglm.py +323 -0
  30. llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full → saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350}/tokenizer.model +2 -2
  31. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/tokenizer_config.json +148 -0
  32. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/trainer_state.json +296 -0
  33. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/training_args.bin +3 -0
  34. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/README.md +202 -0
  35. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/adapter_config.json +31 -0
  36. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/adapter_model.safetensors +3 -0
  37. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/added_tokens.json +16 -0
  38. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/optimizer.pt +3 -0
  39. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/rng_state.pth +3 -0
  40. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/scheduler.pt +3 -0
  41. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/special_tokens_map.json +32 -0
  42. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/tokenization_chatglm.py +323 -0
  43. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/tokenizer.model +3 -0
  44. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/tokenizer_config.json +148 -0
  45. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/trainer_state.json +424 -0
  46. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/training_args.bin +3 -0
  47. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-700/README.md +202 -0
  48. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-700/adapter_config.json +31 -0
  49. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-700/adapter_model.safetensors +3 -0
  50. llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-700/added_tokens.json +16 -0
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/added_tokens.json DELETED
@@ -1,8 +0,0 @@
- {
-   "[UNUSED_TOKEN_141]": 92544,
-   "[UNUSED_TOKEN_142]": 92545,
-   "[UNUSED_TOKEN_143]": 92546,
-   "[UNUSED_TOKEN_144]": 92547,
-   "[UNUSED_TOKEN_145]": 92548,
-   "[UNUSED_TOKEN_146]": 92549
- }
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/config.json DELETED
@@ -1,37 +0,0 @@
- {
-   "_name_or_path": "internlm/internlm2_5-7b-chat-1m",
-   "architectures": [
-     "InternLM2ForCausalLM"
-   ],
-   "attn_implementation": "eager",
-   "auto_map": {
-     "AutoConfig": "configuration_internlm2.InternLM2Config",
-     "AutoModel": "modeling_internlm2.InternLM2ForCausalLM",
-     "AutoModelForCausalLM": "internlm/internlm2_5-7b-chat-1m--modeling_internlm2.InternLM2ForCausalLM"
-   },
-   "bias": false,
-   "bos_token_id": 1,
-   "eos_token_id": 2,
-   "hidden_act": "silu",
-   "hidden_size": 4096,
-   "initializer_range": 0.02,
-   "intermediate_size": 14336,
-   "max_position_embeddings": 262144,
-   "model_type": "internlm2",
-   "num_attention_heads": 32,
-   "num_hidden_layers": 32,
-   "num_key_value_heads": 8,
-   "pad_token_id": 2,
-   "pretraining_tp": 1,
-   "rms_norm_eps": 1e-05,
-   "rope_scaling": {
-     "factor": 2.5,
-     "type": "dynamic"
-   },
-   "rope_theta": 50000000,
-   "tie_word_embeddings": false,
-   "torch_dtype": "bfloat16",
-   "transformers_version": "4.42.3",
-   "use_cache": true,
-   "vocab_size": 92544
- }
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/configuration_internlm2.py DELETED
@@ -1,180 +0,0 @@
- # coding=utf-8
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
- #
- # This code is based on transformers/src/transformers/models/llama/configuration_llama.py
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- """ InternLM2 model configuration"""
-
- from transformers.configuration_utils import PretrainedConfig
- from transformers.utils import logging
-
- logger = logging.get_logger(__name__)
-
- INTERNLM2_PRETRAINED_CONFIG_ARCHIVE_MAP = {}
-
-
- # Modified from transformers.model.llama.configuration_llama.LlamaConfig
- class InternLM2Config(PretrainedConfig):
-     r"""
-     This is the configuration class to store the configuration of a [`InternLM2Model`]. It is used to instantiate
-     an InternLM2 model according to the specified arguments, defining the model architecture. Instantiating a
-     configuration with the defaults will yield a similar configuration to that of the InternLM2-7B.
-
-     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
-     documentation from [`PretrainedConfig`] for more information.
-
-
-     Args:
-         vocab_size (`int`, *optional*, defaults to 32000):
-             Vocabulary size of the InternLM2 model. Defines the number of different tokens that can be represented by the
-             `inputs_ids` passed when calling [`InternLM2Model`]
-         hidden_size (`int`, *optional*, defaults to 4096):
-             Dimension of the hidden representations.
-         intermediate_size (`int`, *optional*, defaults to 11008):
-             Dimension of the MLP representations.
-         num_hidden_layers (`int`, *optional*, defaults to 32):
-             Number of hidden layers in the Transformer decoder.
-         num_attention_heads (`int`, *optional*, defaults to 32):
-             Number of attention heads for each attention layer in the Transformer decoder.
-         num_key_value_heads (`int`, *optional*):
-             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
-             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
-             `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
-             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
-             by meanpooling all the original heads within that group. For more details checkout [this
-             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
-             `num_attention_heads`.
-         hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
-             The non-linear activation function (function or string) in the decoder.
-         max_position_embeddings (`int`, *optional*, defaults to 2048):
-             The maximum sequence length that this model might ever be used with. InternLM2 supports up to 32768 tokens.
-         initializer_range (`float`, *optional*, defaults to 0.02):
-             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-         rms_norm_eps (`float`, *optional*, defaults to 1e-06):
-             The epsilon used by the rms normalization layers.
-         use_cache (`bool`, *optional*, defaults to `True`):
-             Whether or not the model should return the last key/values attentions (not used by all models). Only
-             relevant if `config.is_decoder=True`.
-         pad_token_id (`int`, *optional*):
-             Padding token id.
-         bos_token_id (`int`, *optional*, defaults to 1):
-             Beginning of stream token id.
-         eos_token_id (`int`, *optional*, defaults to 2):
-             End of stream token id.
-         pretraining_tp (`int`, *optional*, defaults to 1):
-             Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
-             document](https://huggingface.co/docs/transformers/main/perf_train_gpu_many#tensor-parallelism)
-             to understand more about it. This value is necessary to ensure exact reproducibility
-             of the pretraining results. Please refer to [this
-             issue](https://github.com/pytorch/pytorch/issues/76232).
-         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
-             Whether to tie weight embeddings
-         rope_theta (`float`, *optional*, defaults to 10000.0):
-             The base period of the RoPE embeddings.
-         rope_scaling (`Dict`, *optional*):
-             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
-             strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
-             `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
-             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
-             these scaling strategies behave:
-             https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
-             experimental feature, subject to breaking API changes in future versions.
-     """
-     _auto_class = "AutoConfig"
-     model_type = "internlm2"
-     keys_to_ignore_at_inference = ["past_key_values"]
-
-     def __init__( # pylint: disable=W0102
-         self,
-         vocab_size=103168,
-         hidden_size=4096,
-         intermediate_size=11008,
-         num_hidden_layers=32,
-         num_attention_heads=32,
-         num_key_value_heads=None,
-         hidden_act="silu",
-         max_position_embeddings=2048,
-         initializer_range=0.02,
-         rms_norm_eps=1e-6,
-         use_cache=True,
-         pad_token_id=0,
-         bos_token_id=1,
-         eos_token_id=2,
-         pretraining_tp=1,
-         tie_word_embeddings=False,
-         bias=True,
-         rope_theta=10000,
-         rope_scaling=None,
-         attn_implementation=None,
-         **kwargs,
-     ):
-         self.vocab_size = vocab_size
-         self.max_position_embeddings = max_position_embeddings
-         self.hidden_size = hidden_size
-         self.intermediate_size = intermediate_size
-         self.num_hidden_layers = num_hidden_layers
-         self.num_attention_heads = num_attention_heads
-         self.bias = bias
-
-         if num_key_value_heads is None:
-             num_key_value_heads = num_attention_heads
-         self.num_key_value_heads = num_key_value_heads
-
-         self.hidden_act = hidden_act
-         self.initializer_range = initializer_range
-         self.rms_norm_eps = rms_norm_eps
-         self.pretraining_tp = pretraining_tp
-         self.use_cache = use_cache
-         self.rope_theta = rope_theta
-         self.rope_scaling = rope_scaling
-         self._rope_scaling_validation()
-         self.attn_implementation = attn_implementation
-         if self.attn_implementation is None:
-             self.attn_implementation = "eager"
-
-         super().__init__(
-             pad_token_id=pad_token_id,
-             bos_token_id=bos_token_id,
-             eos_token_id=eos_token_id,
-             tie_word_embeddings=tie_word_embeddings,
-             **kwargs,
-         )
-
-     def _rope_scaling_validation(self):
-         """
-         Validate the `rope_scaling` configuration.
-         """
-         if self.rope_scaling is None:
-             return
-
-         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
-             raise ValueError(
-                 "`rope_scaling` must be a dictionary with with two fields, `type` and `factor`, "
-                 f"got {self.rope_scaling}"
-             )
-         rope_scaling_type = self.rope_scaling.get("type", None)
-         rope_scaling_factor = self.rope_scaling.get("factor", None)
-         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
-             raise ValueError(
-                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
-             )
-         if (
-             rope_scaling_factor is None
-             or not isinstance(rope_scaling_factor, (float, int))
-             or rope_scaling_factor < 1.0
-         ):
-             raise ValueError(
-                 f"`rope_scaling`'s factor field must be a number >= 1, got {rope_scaling_factor} "
-                 f"of type {type(rope_scaling_factor)}"
-             )
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/generation_config.json DELETED
@@ -1,9 +0,0 @@
- {
-   "bos_token_id": 1,
-   "eos_token_id": [
-     2,
-     92542
-   ],
-   "pad_token_id": 2,
-   "transformers_version": "4.42.3"
- }
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00001-of-00008.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:974b14608ff94120f8a7b69a8319b69c9e102c1dd04898ce680e2f48ee344ef1
- size 1949337704
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00005-of-00008.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:8b7f335d694350b36baa6fd45f208ca6899a5034ceffeef3087f59cdc8fc073c
- size 1979780456
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00006-of-00008.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:a1fabd10fa2b73b9763c12954bdfde663f8572beb20df4075d522fcf6c5a51cc
- size 1946242728
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00007-of-00008.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:ce32061ffd1f9ec4f0c5ff9a0593d0601e852833562c2f925279d06057a9bd94
- size 1979780456
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00008-of-00008.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:709d41db0bbf989d866e27b7743c60a9b9445d6051dc47b554a88efe17925b22
- size 1748035640
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model.safetensors.index.json DELETED
@@ -1,234 +0,0 @@
1
- {
2
- "metadata": {
3
- "total_size": 15475417088
4
- },
5
- "weight_map": {
6
- "model.layers.0.attention.wo.weight": "model-00001-of-00008.safetensors",
7
- "model.layers.0.attention.wqkv.weight": "model-00001-of-00008.safetensors",
8
- "model.layers.0.attention_norm.weight": "model-00001-of-00008.safetensors",
9
- "model.layers.0.feed_forward.w1.weight": "model-00001-of-00008.safetensors",
10
- "model.layers.0.feed_forward.w2.weight": "model-00001-of-00008.safetensors",
11
- "model.layers.0.feed_forward.w3.weight": "model-00001-of-00008.safetensors",
12
- "model.layers.0.ffn_norm.weight": "model-00001-of-00008.safetensors",
13
- "model.layers.1.attention.wo.weight": "model-00001-of-00008.safetensors",
14
- "model.layers.1.attention.wqkv.weight": "model-00001-of-00008.safetensors",
15
- "model.layers.1.attention_norm.weight": "model-00001-of-00008.safetensors",
16
- "model.layers.1.feed_forward.w1.weight": "model-00001-of-00008.safetensors",
17
- "model.layers.1.feed_forward.w2.weight": "model-00001-of-00008.safetensors",
18
- "model.layers.1.feed_forward.w3.weight": "model-00001-of-00008.safetensors",
19
- "model.layers.1.ffn_norm.weight": "model-00001-of-00008.safetensors",
20
- "model.layers.10.attention.wo.weight": "model-00003-of-00008.safetensors",
21
- "model.layers.10.attention.wqkv.weight": "model-00003-of-00008.safetensors",
22
- "model.layers.10.attention_norm.weight": "model-00003-of-00008.safetensors",
23
- "model.layers.10.feed_forward.w1.weight": "model-00003-of-00008.safetensors",
24
- "model.layers.10.feed_forward.w2.weight": "model-00003-of-00008.safetensors",
25
- "model.layers.10.feed_forward.w3.weight": "model-00003-of-00008.safetensors",
26
- "model.layers.10.ffn_norm.weight": "model-00003-of-00008.safetensors",
27
- "model.layers.11.attention.wo.weight": "model-00003-of-00008.safetensors",
28
- "model.layers.11.attention.wqkv.weight": "model-00003-of-00008.safetensors",
29
- "model.layers.11.attention_norm.weight": "model-00004-of-00008.safetensors",
30
- "model.layers.11.feed_forward.w1.weight": "model-00003-of-00008.safetensors",
31
- "model.layers.11.feed_forward.w2.weight": "model-00004-of-00008.safetensors",
32
- "model.layers.11.feed_forward.w3.weight": "model-00003-of-00008.safetensors",
33
- "model.layers.11.ffn_norm.weight": "model-00004-of-00008.safetensors",
34
- "model.layers.12.attention.wo.weight": "model-00004-of-00008.safetensors",
35
- "model.layers.12.attention.wqkv.weight": "model-00004-of-00008.safetensors",
36
- "model.layers.12.attention_norm.weight": "model-00004-of-00008.safetensors",
37
- "model.layers.12.feed_forward.w1.weight": "model-00004-of-00008.safetensors",
38
- "model.layers.12.feed_forward.w2.weight": "model-00004-of-00008.safetensors",
39
- "model.layers.12.feed_forward.w3.weight": "model-00004-of-00008.safetensors",
40
- "model.layers.12.ffn_norm.weight": "model-00004-of-00008.safetensors",
41
- "model.layers.13.attention.wo.weight": "model-00004-of-00008.safetensors",
42
- "model.layers.13.attention.wqkv.weight": "model-00004-of-00008.safetensors",
43
- "model.layers.13.attention_norm.weight": "model-00004-of-00008.safetensors",
44
- "model.layers.13.feed_forward.w1.weight": "model-00004-of-00008.safetensors",
45
- "model.layers.13.feed_forward.w2.weight": "model-00004-of-00008.safetensors",
46
- "model.layers.13.feed_forward.w3.weight": "model-00004-of-00008.safetensors",
47
- "model.layers.13.ffn_norm.weight": "model-00004-of-00008.safetensors",
48
- "model.layers.14.attention.wo.weight": "model-00004-of-00008.safetensors",
49
- "model.layers.14.attention.wqkv.weight": "model-00004-of-00008.safetensors",
50
- "model.layers.14.attention_norm.weight": "model-00004-of-00008.safetensors",
51
- "model.layers.14.feed_forward.w1.weight": "model-00004-of-00008.safetensors",
52
- "model.layers.14.feed_forward.w2.weight": "model-00004-of-00008.safetensors",
53
- "model.layers.14.feed_forward.w3.weight": "model-00004-of-00008.safetensors",
54
- "model.layers.14.ffn_norm.weight": "model-00004-of-00008.safetensors",
55
- "model.layers.15.attention.wo.weight": "model-00004-of-00008.safetensors",
56
- "model.layers.15.attention.wqkv.weight": "model-00004-of-00008.safetensors",
57
- "model.layers.15.attention_norm.weight": "model-00004-of-00008.safetensors",
58
- "model.layers.15.feed_forward.w1.weight": "model-00004-of-00008.safetensors",
59
- "model.layers.15.feed_forward.w2.weight": "model-00004-of-00008.safetensors",
60
- "model.layers.15.feed_forward.w3.weight": "model-00004-of-00008.safetensors",
61
- "model.layers.15.ffn_norm.weight": "model-00004-of-00008.safetensors",
62
- "model.layers.16.attention.wo.weight": "model-00004-of-00008.safetensors",
63
- "model.layers.16.attention.wqkv.weight": "model-00004-of-00008.safetensors",
64
- "model.layers.16.attention_norm.weight": "model-00005-of-00008.safetensors",
65
- "model.layers.16.feed_forward.w1.weight": "model-00005-of-00008.safetensors",
66
- "model.layers.16.feed_forward.w2.weight": "model-00005-of-00008.safetensors",
67
- "model.layers.16.feed_forward.w3.weight": "model-00005-of-00008.safetensors",
68
- "model.layers.16.ffn_norm.weight": "model-00005-of-00008.safetensors",
69
- "model.layers.17.attention.wo.weight": "model-00005-of-00008.safetensors",
70
- "model.layers.17.attention.wqkv.weight": "model-00005-of-00008.safetensors",
71
- "model.layers.17.attention_norm.weight": "model-00005-of-00008.safetensors",
72
- "model.layers.17.feed_forward.w1.weight": "model-00005-of-00008.safetensors",
73
- "model.layers.17.feed_forward.w2.weight": "model-00005-of-00008.safetensors",
74
- "model.layers.17.feed_forward.w3.weight": "model-00005-of-00008.safetensors",
75
- "model.layers.17.ffn_norm.weight": "model-00005-of-00008.safetensors",
76
- "model.layers.18.attention.wo.weight": "model-00005-of-00008.safetensors",
77
- "model.layers.18.attention.wqkv.weight": "model-00005-of-00008.safetensors",
78
- "model.layers.18.attention_norm.weight": "model-00005-of-00008.safetensors",
79
- "model.layers.18.feed_forward.w1.weight": "model-00005-of-00008.safetensors",
80
- "model.layers.18.feed_forward.w2.weight": "model-00005-of-00008.safetensors",
81
- "model.layers.18.feed_forward.w3.weight": "model-00005-of-00008.safetensors",
82
- "model.layers.18.ffn_norm.weight": "model-00005-of-00008.safetensors",
83
- "model.layers.19.attention.wo.weight": "model-00005-of-00008.safetensors",
84
- "model.layers.19.attention.wqkv.weight": "model-00005-of-00008.safetensors",
85
- "model.layers.19.attention_norm.weight": "model-00005-of-00008.safetensors",
86
- "model.layers.19.feed_forward.w1.weight": "model-00005-of-00008.safetensors",
87
- "model.layers.19.feed_forward.w2.weight": "model-00005-of-00008.safetensors",
88
- "model.layers.19.feed_forward.w3.weight": "model-00005-of-00008.safetensors",
89
- "model.layers.19.ffn_norm.weight": "model-00005-of-00008.safetensors",
90
- "model.layers.2.attention.wo.weight": "model-00001-of-00008.safetensors",
91
- "model.layers.2.attention.wqkv.weight": "model-00001-of-00008.safetensors",
92
- "model.layers.2.attention_norm.weight": "model-00002-of-00008.safetensors",
93
- "model.layers.2.feed_forward.w1.weight": "model-00001-of-00008.safetensors",
94
- "model.layers.2.feed_forward.w2.weight": "model-00002-of-00008.safetensors",
95
- "model.layers.2.feed_forward.w3.weight": "model-00001-of-00008.safetensors",
96
- "model.layers.2.ffn_norm.weight": "model-00002-of-00008.safetensors",
97
- "model.layers.20.attention.wo.weight": "model-00005-of-00008.safetensors",
98
- "model.layers.20.attention.wqkv.weight": "model-00005-of-00008.safetensors",
99
- "model.layers.20.attention_norm.weight": "model-00006-of-00008.safetensors",
100
- "model.layers.20.feed_forward.w1.weight": "model-00005-of-00008.safetensors",
101
- "model.layers.20.feed_forward.w2.weight": "model-00006-of-00008.safetensors",
102
- "model.layers.20.feed_forward.w3.weight": "model-00005-of-00008.safetensors",
103
- "model.layers.20.ffn_norm.weight": "model-00006-of-00008.safetensors",
104
- "model.layers.21.attention.wo.weight": "model-00006-of-00008.safetensors",
105
- "model.layers.21.attention.wqkv.weight": "model-00006-of-00008.safetensors",
106
- "model.layers.21.attention_norm.weight": "model-00006-of-00008.safetensors",
107
- "model.layers.21.feed_forward.w1.weight": "model-00006-of-00008.safetensors",
108
- "model.layers.21.feed_forward.w2.weight": "model-00006-of-00008.safetensors",
109
- "model.layers.21.feed_forward.w3.weight": "model-00006-of-00008.safetensors",
110
- "model.layers.21.ffn_norm.weight": "model-00006-of-00008.safetensors",
111
- "model.layers.22.attention.wo.weight": "model-00006-of-00008.safetensors",
112
- "model.layers.22.attention.wqkv.weight": "model-00006-of-00008.safetensors",
113
- "model.layers.22.attention_norm.weight": "model-00006-of-00008.safetensors",
114
- "model.layers.22.feed_forward.w1.weight": "model-00006-of-00008.safetensors",
115
- "model.layers.22.feed_forward.w2.weight": "model-00006-of-00008.safetensors",
116
- "model.layers.22.feed_forward.w3.weight": "model-00006-of-00008.safetensors",
117
- "model.layers.22.ffn_norm.weight": "model-00006-of-00008.safetensors",
118
- "model.layers.23.attention.wo.weight": "model-00006-of-00008.safetensors",
119
- "model.layers.23.attention.wqkv.weight": "model-00006-of-00008.safetensors",
120
- "model.layers.23.attention_norm.weight": "model-00006-of-00008.safetensors",
121
- "model.layers.23.feed_forward.w1.weight": "model-00006-of-00008.safetensors",
122
- "model.layers.23.feed_forward.w2.weight": "model-00006-of-00008.safetensors",
123
- "model.layers.23.feed_forward.w3.weight": "model-00006-of-00008.safetensors",
124
- "model.layers.23.ffn_norm.weight": "model-00006-of-00008.safetensors",
125
- "model.layers.24.attention.wo.weight": "model-00006-of-00008.safetensors",
126
- "model.layers.24.attention.wqkv.weight": "model-00006-of-00008.safetensors",
127
- "model.layers.24.attention_norm.weight": "model-00006-of-00008.safetensors",
128
- "model.layers.24.feed_forward.w1.weight": "model-00006-of-00008.safetensors",
129
- "model.layers.24.feed_forward.w2.weight": "model-00006-of-00008.safetensors",
130
- "model.layers.24.feed_forward.w3.weight": "model-00006-of-00008.safetensors",
131
- "model.layers.24.ffn_norm.weight": "model-00006-of-00008.safetensors",
132
- "model.layers.25.attention.wo.weight": "model-00006-of-00008.safetensors",
133
- "model.layers.25.attention.wqkv.weight": "model-00006-of-00008.safetensors",
134
- "model.layers.25.attention_norm.weight": "model-00007-of-00008.safetensors",
135
- "model.layers.25.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
136
- "model.layers.25.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
137
- "model.layers.25.feed_forward.w3.weight": "model-00007-of-00008.safetensors",
138
- "model.layers.25.ffn_norm.weight": "model-00007-of-00008.safetensors",
139
- "model.layers.26.attention.wo.weight": "model-00007-of-00008.safetensors",
140
- "model.layers.26.attention.wqkv.weight": "model-00007-of-00008.safetensors",
141
- "model.layers.26.attention_norm.weight": "model-00007-of-00008.safetensors",
142
- "model.layers.26.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
143
- "model.layers.26.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
144
- "model.layers.26.feed_forward.w3.weight": "model-00007-of-00008.safetensors",
145
- "model.layers.26.ffn_norm.weight": "model-00007-of-00008.safetensors",
146
- "model.layers.27.attention.wo.weight": "model-00007-of-00008.safetensors",
147
- "model.layers.27.attention.wqkv.weight": "model-00007-of-00008.safetensors",
148
- "model.layers.27.attention_norm.weight": "model-00007-of-00008.safetensors",
149
- "model.layers.27.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
150
- "model.layers.27.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
151
- "model.layers.27.feed_forward.w3.weight": "model-00007-of-00008.safetensors",
152
- "model.layers.27.ffn_norm.weight": "model-00007-of-00008.safetensors",
153
- "model.layers.28.attention.wo.weight": "model-00007-of-00008.safetensors",
154
- "model.layers.28.attention.wqkv.weight": "model-00007-of-00008.safetensors",
155
- "model.layers.28.attention_norm.weight": "model-00007-of-00008.safetensors",
156
- "model.layers.28.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
157
- "model.layers.28.feed_forward.w2.weight": "model-00007-of-00008.safetensors",
158
- "model.layers.28.feed_forward.w3.weight": "model-00007-of-00008.safetensors",
159
- "model.layers.28.ffn_norm.weight": "model-00007-of-00008.safetensors",
160
- "model.layers.29.attention.wo.weight": "model-00007-of-00008.safetensors",
161
- "model.layers.29.attention.wqkv.weight": "model-00007-of-00008.safetensors",
162
- "model.layers.29.attention_norm.weight": "model-00008-of-00008.safetensors",
163
- "model.layers.29.feed_forward.w1.weight": "model-00007-of-00008.safetensors",
164
- "model.layers.29.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
165
- "model.layers.29.feed_forward.w3.weight": "model-00007-of-00008.safetensors",
166
- "model.layers.29.ffn_norm.weight": "model-00008-of-00008.safetensors",
167
- "model.layers.3.attention.wo.weight": "model-00002-of-00008.safetensors",
168
- "model.layers.3.attention.wqkv.weight": "model-00002-of-00008.safetensors",
169
- "model.layers.3.attention_norm.weight": "model-00002-of-00008.safetensors",
170
- "model.layers.3.feed_forward.w1.weight": "model-00002-of-00008.safetensors",
171
- "model.layers.3.feed_forward.w2.weight": "model-00002-of-00008.safetensors",
172
- "model.layers.3.feed_forward.w3.weight": "model-00002-of-00008.safetensors",
173
- "model.layers.3.ffn_norm.weight": "model-00002-of-00008.safetensors",
174
- "model.layers.30.attention.wo.weight": "model-00008-of-00008.safetensors",
175
- "model.layers.30.attention.wqkv.weight": "model-00008-of-00008.safetensors",
176
- "model.layers.30.attention_norm.weight": "model-00008-of-00008.safetensors",
177
- "model.layers.30.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
178
- "model.layers.30.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
179
- "model.layers.30.feed_forward.w3.weight": "model-00008-of-00008.safetensors",
180
- "model.layers.30.ffn_norm.weight": "model-00008-of-00008.safetensors",
181
- "model.layers.31.attention.wo.weight": "model-00008-of-00008.safetensors",
182
- "model.layers.31.attention.wqkv.weight": "model-00008-of-00008.safetensors",
183
- "model.layers.31.attention_norm.weight": "model-00008-of-00008.safetensors",
184
- "model.layers.31.feed_forward.w1.weight": "model-00008-of-00008.safetensors",
185
- "model.layers.31.feed_forward.w2.weight": "model-00008-of-00008.safetensors",
186
- "model.layers.31.feed_forward.w3.weight": "model-00008-of-00008.safetensors",
187
- "model.layers.31.ffn_norm.weight": "model-00008-of-00008.safetensors",
188
- "model.layers.4.attention.wo.weight": "model-00002-of-00008.safetensors",
189
- "model.layers.4.attention.wqkv.weight": "model-00002-of-00008.safetensors",
190
- "model.layers.4.attention_norm.weight": "model-00002-of-00008.safetensors",
191
- "model.layers.4.feed_forward.w1.weight": "model-00002-of-00008.safetensors",
192
- "model.layers.4.feed_forward.w2.weight": "model-00002-of-00008.safetensors",
193
- "model.layers.4.feed_forward.w3.weight": "model-00002-of-00008.safetensors",
194
- "model.layers.4.ffn_norm.weight": "model-00002-of-00008.safetensors",
195
- "model.layers.5.attention.wo.weight": "model-00002-of-00008.safetensors",
196
- "model.layers.5.attention.wqkv.weight": "model-00002-of-00008.safetensors",
197
- "model.layers.5.attention_norm.weight": "model-00002-of-00008.safetensors",
198
- "model.layers.5.feed_forward.w1.weight": "model-00002-of-00008.safetensors",
199
- "model.layers.5.feed_forward.w2.weight": "model-00002-of-00008.safetensors",
200
- "model.layers.5.feed_forward.w3.weight": "model-00002-of-00008.safetensors",
201
- "model.layers.5.ffn_norm.weight": "model-00002-of-00008.safetensors",
202
- "model.layers.6.attention.wo.weight": "model-00002-of-00008.safetensors",
203
- "model.layers.6.attention.wqkv.weight": "model-00002-of-00008.safetensors",
204
- "model.layers.6.attention_norm.weight": "model-00002-of-00008.safetensors",
205
- "model.layers.6.feed_forward.w1.weight": "model-00002-of-00008.safetensors",
206
- "model.layers.6.feed_forward.w2.weight": "model-00002-of-00008.safetensors",
207
- "model.layers.6.feed_forward.w3.weight": "model-00002-of-00008.safetensors",
208
- "model.layers.6.ffn_norm.weight": "model-00002-of-00008.safetensors",
209
- "model.layers.7.attention.wo.weight": "model-00002-of-00008.safetensors",
210
- "model.layers.7.attention.wqkv.weight": "model-00002-of-00008.safetensors",
211
- "model.layers.7.attention_norm.weight": "model-00003-of-00008.safetensors",
212
- "model.layers.7.feed_forward.w1.weight": "model-00003-of-00008.safetensors",
213
- "model.layers.7.feed_forward.w2.weight": "model-00003-of-00008.safetensors",
214
- "model.layers.7.feed_forward.w3.weight": "model-00003-of-00008.safetensors",
215
- "model.layers.7.ffn_norm.weight": "model-00003-of-00008.safetensors",
216
- "model.layers.8.attention.wo.weight": "model-00003-of-00008.safetensors",
217
- "model.layers.8.attention.wqkv.weight": "model-00003-of-00008.safetensors",
218
- "model.layers.8.attention_norm.weight": "model-00003-of-00008.safetensors",
219
- "model.layers.8.feed_forward.w1.weight": "model-00003-of-00008.safetensors",
220
- "model.layers.8.feed_forward.w2.weight": "model-00003-of-00008.safetensors",
221
- "model.layers.8.feed_forward.w3.weight": "model-00003-of-00008.safetensors",
222
- "model.layers.8.ffn_norm.weight": "model-00003-of-00008.safetensors",
223
- "model.layers.9.attention.wo.weight": "model-00003-of-00008.safetensors",
224
- "model.layers.9.attention.wqkv.weight": "model-00003-of-00008.safetensors",
225
- "model.layers.9.attention_norm.weight": "model-00003-of-00008.safetensors",
226
- "model.layers.9.feed_forward.w1.weight": "model-00003-of-00008.safetensors",
227
- "model.layers.9.feed_forward.w2.weight": "model-00003-of-00008.safetensors",
228
- "model.layers.9.feed_forward.w3.weight": "model-00003-of-00008.safetensors",
229
- "model.layers.9.ffn_norm.weight": "model-00003-of-00008.safetensors",
230
- "model.norm.weight": "model-00008-of-00008.safetensors",
231
- "model.tok_embeddings.weight": "model-00001-of-00008.safetensors",
232
- "output.weight": "model-00008-of-00008.safetensors"
233
- }
234
- }
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/modeling_internlm2.py DELETED
@@ -1,1800 +0,0 @@
1
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
- #
3
- # This code is based on transformers/src/transformers/models/llama/modeling_llama.py
4
- #
5
- # Licensed under the Apache License, Version 2.0 (the "License");
6
- # you may not use this file except in compliance with the License.
7
- # You may obtain a copy of the License at
8
- #
9
- # http://www.apache.org/licenses/LICENSE-2.0
10
- #
11
- # Unless required by applicable law or agreed to in writing, software
12
- # distributed under the License is distributed on an "AS IS" BASIS,
13
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
- # See the License for the specific language governing permissions and
15
- # limitations under the License.
16
- """PyTorch InternLM2.5 model."""
17
- import math
18
- import queue
19
- import threading
20
- from typing import List, Optional, Tuple, Union
21
-
22
- import torch
23
- import torch.nn.functional as F
24
- import torch.utils.checkpoint
25
- from einops import rearrange
26
- from torch import nn
27
- from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
28
- from transformers.activations import ACT2FN
29
- from transformers.cache_utils import Cache, DynamicCache, StaticCache
30
- from transformers.modeling_attn_mask_utils import AttentionMaskConverter
31
- from transformers.modeling_outputs import (
32
- BaseModelOutputWithPast,
33
- CausalLMOutputWithPast,
34
- QuestionAnsweringModelOutput,
35
- SequenceClassifierOutputWithPast,
36
- TokenClassifierOutput,
37
- )
38
- from transformers.modeling_utils import PreTrainedModel
39
- from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
40
- from transformers.utils import (
41
- add_start_docstrings,
42
- add_start_docstrings_to_model_forward,
43
- is_flash_attn_greater_or_equal_2_10,
44
- logging,
45
- replace_return_docstrings,
46
- )
47
-
48
- try:
49
- from transformers.generation.streamers import BaseStreamer
50
- except Exception:
51
- BaseStreamer = None
52
-
53
- from .configuration_internlm2 import InternLM2Config
54
-
55
-
56
- try:
57
- from flash_attn import flash_attn_func, flash_attn_varlen_func
58
- from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input
59
- except:
60
- pass
61
-
62
-
63
- logger = logging.get_logger(__name__)
64
-
65
- _CONFIG_FOR_DOC = "InternLM2Config"
66
-
67
-
68
- def _get_unpad_data(attention_mask):
69
- seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
70
- indices = torch.nonzero(attention_mask.flatten(), as_tuple=False).flatten()
71
- max_seqlen_in_batch = seqlens_in_batch.max().item()
72
- cu_seqlens = F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0)) # pylint: disable=E1102
73
- return (
74
- indices,
75
- cu_seqlens,
76
- max_seqlen_in_batch,
77
- )
78
-
79
-
80
- class InternLM2RMSNorm(nn.Module):
81
- """InternLM2RMSNorm is equivalent to T5LayerNorm."""
82
-
83
- def __init__(self, hidden_size, eps=1e-6):
84
- super().__init__()
85
- self.weight = nn.Parameter(torch.ones(hidden_size))
86
- self.variance_epsilon = eps
87
-
88
- def forward(self, hidden_states):
89
- input_dtype = hidden_states.dtype
90
- hidden_states = hidden_states.to(torch.float32)
91
- variance = hidden_states.pow(2).mean(-1, keepdim=True)
92
- hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
93
- return self.weight * hidden_states.to(input_dtype)
94
-
95
-
96
- ALL_LAYERNORM_LAYERS.append(InternLM2RMSNorm)
97
-
98
-
99
- class InternLM2RotaryEmbedding(nn.Module):
100
- """Rotary Position Embedding for the InternLM2 model. Credits to the Reddit user /u/lucidrains."""
101
-
102
- def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
103
- super().__init__()
104
- self.scaling_factor = scaling_factor
105
- self.dim = dim
106
- self.max_position_embeddings = max_position_embeddings
107
- self.base = base
108
- inv_freq = 1.0 / (self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(device) / self.dim))
109
- self.register_buffer("inv_freq", inv_freq, persistent=False)
110
- # For BC we register cos and sin cached
111
- self.max_seq_len_cached = max_position_embeddings
112
-
113
- @torch.no_grad()
114
- def forward(self, x, position_ids):
115
- # x: [bs, num_attention_heads, seq_len, head_size]
116
- inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
117
- position_ids_expanded = position_ids[:, None, :].float()
118
- # Force float32 since bfloat16 loses precision on long contexts
119
- # See https://github.com/huggingface/transformers/pull/29285
120
- device_type = x.device.type
121
- device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
122
- with torch.autocast(device_type=device_type, enabled=False):
123
- freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
124
- emb = torch.cat((freqs, freqs), dim=-1)
125
- cos = emb.cos()
126
- sin = emb.sin()
127
- return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
128
-
129
-
130
- class InternLM2LinearScalingRotaryEmbedding(InternLM2RotaryEmbedding):
131
- """InternLM2RotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
132
-
133
- def forward(self, x, position_ids):
134
- # difference to the original RoPE: a scaling factor is aplied to the position ids
135
- position_ids = position_ids.float() / self.scaling_factor
136
- cos, sin = super().forward(x, position_ids)
137
- return cos, sin
138
-
139
-
140
- class InternLM2DynamicNTKScalingRotaryEmbedding(InternLM2RotaryEmbedding):
141
- """InternLM2RotaryEmbedding extended with Dynamic NTK scaling.
142
- Credits to the Reddit users /u/bloc97 and /u/emozilla"""
143
-
144
- def forward(self, x, position_ids):
145
- # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length
146
- seq_len = torch.max(position_ids) + 1
147
- if seq_len > self.max_position_embeddings:
148
- base = self.base * (
149
- (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
150
- ) ** (self.dim / (self.dim - 2))
151
- inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2, dtype=torch.int64).float().to(x.device) / self.dim))
152
- self.register_buffer("inv_freq", inv_freq, persistent=False) # TODO joao: this may break with compilation
153
-
154
- cos, sin = super().forward(x, position_ids)
155
- return cos, sin
156
-
157
-
158
- def rotate_half(x):
159
- """Rotates half the hidden dims of the input."""
160
- x1 = x[..., : x.shape[-1] // 2]
161
- x2 = x[..., x.shape[-1] // 2 :]
162
- return torch.cat((-x2, x1), dim=-1)
163
-
164
-
165
- def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1): # pylint: disable=unused-argument
166
- """Applies Rotary Position Embedding to the query and key tensors.
167
-
168
- Args:
169
- q (`torch.Tensor`): The query tensor.
170
- k (`torch.Tensor`): The key tensor.
171
- cos (`torch.Tensor`): The cosine part of the rotary embedding.
172
- sin (`torch.Tensor`): The sine part of the rotary embedding.
173
- position_ids (`torch.Tensor`, *optional*):
174
- Deprecated and unused.
175
- unsqueeze_dim (`int`, *optional*, defaults to 1):
176
- The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
177
- sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
178
- that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
179
- k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
180
- cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
181
- the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
182
- Returns:
183
- `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
184
- """
185
- cos = cos.unsqueeze(unsqueeze_dim)
186
- sin = sin.unsqueeze(unsqueeze_dim)
187
- q_embed = (q * cos) + (rotate_half(q) * sin)
188
- k_embed = (k * cos) + (rotate_half(k) * sin)
189
- return q_embed, k_embed
190
-
191
-
192
- class InternLM2MLP(nn.Module):
193
- """MLP for InternLM2 model."""
194
-
195
- def __init__(self, config):
196
- super().__init__()
197
- self.config = config
198
- self.hidden_size = config.hidden_size
199
- self.intermediate_size = config.intermediate_size
200
- self.w1 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
201
- self.w3 = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
202
- self.w2 = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
203
- self.act_fn = ACT2FN[config.hidden_act]
204
-
205
- def forward(self, x):
206
- down_proj = self.w2(self.act_fn(self.w1(x)) * self.w3(x))
207
-
208
- return down_proj
209
-
210
-
211
- def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
212
- """
213
- This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
214
- num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
215
- """
216
- batch, num_key_value_heads, slen, head_dim = hidden_states.shape
217
- if n_rep == 1:
218
- return hidden_states
219
- hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
220
- return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
221
-
222
-
223
- class InternLM2Attention(nn.Module):
224
- """Multi-headed attention from 'Attention Is All You Need' paper"""
225
-
226
- def __init__(self, config: InternLM2Config, layer_idx: Optional[int] = None):
227
- super().__init__()
228
- self.config = config
229
- self.layer_idx = layer_idx
230
- if layer_idx is None:
231
- logger.warning_once(
232
- f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
233
- "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
234
- "when creating this class."
235
- )
236
-
237
- self.hidden_size = config.hidden_size
238
- self.num_heads = config.num_attention_heads
239
- self.head_dim = self.hidden_size // self.num_heads
240
- self.num_key_value_heads = config.num_key_value_heads
241
- self.num_key_value_groups = self.num_heads // self.num_key_value_heads
242
- self.max_position_embeddings = config.max_position_embeddings
243
- self.rope_theta = config.rope_theta
244
- self.is_causal = True
245
-
246
- if (self.head_dim * self.num_heads) != self.hidden_size:
247
- raise ValueError(
248
- f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
249
- f" and `num_heads`: {self.num_heads})."
250
- )
251
-
252
- self.wqkv = nn.Linear(
253
- self.hidden_size,
254
- (self.num_heads + 2 * self.num_key_value_heads) * self.head_dim,
255
- bias=config.bias,
256
- )
257
- self.wo = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.bias)
258
-
259
- self._init_rope()
260
-
261
- def _init_rope(self):
262
- if self.config.rope_scaling is None:
263
- self.rotary_emb = InternLM2RotaryEmbedding(
264
- self.head_dim,
265
- max_position_embeddings=self.max_position_embeddings,
266
- base=self.rope_theta,
267
- )
268
- else:
269
- scaling_type = self.config.rope_scaling["type"]
270
- scaling_factor = self.config.rope_scaling["factor"]
271
- if scaling_type == "linear":
272
- self.rotary_emb = InternLM2LinearScalingRotaryEmbedding(
273
- self.head_dim,
274
- max_position_embeddings=self.max_position_embeddings,
275
- scaling_factor=scaling_factor,
276
- base=self.rope_theta,
277
- )
278
- elif scaling_type == "dynamic":
279
- self.rotary_emb = InternLM2DynamicNTKScalingRotaryEmbedding(
280
- self.head_dim,
281
- max_position_embeddings=self.max_position_embeddings,
282
- scaling_factor=scaling_factor,
283
- base=self.rope_theta,
284
- )
285
- else:
286
- raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
287
-
288
- def forward(
289
- self,
290
- hidden_states: torch.Tensor,
291
- attention_mask: Optional[torch.Tensor] = None,
292
- position_ids: Optional[torch.LongTensor] = None,
293
- past_key_value: Optional[Cache] = None,
294
- output_attentions: bool = False,
295
- use_cache: bool = False, # pylint: disable=unused-argument
296
- cache_position: Optional[torch.LongTensor] = None,
297
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
298
- bsz, q_len, _ = hidden_states.size()
299
-
300
- if self.config.pretraining_tp > 1:
301
- # split qkv_states by tp size
302
- key_value_slicing = (self.num_key_value_heads * self.head_dim) // self.config.pretraining_tp
303
- qkv_slices = self.wqkv.weight.split(key_value_slicing, dim=0)
304
- qkv_states = torch.cat(
305
- [F.linear(hidden_states, qkv_slice) for qkv_slice in qkv_slices], dim=-1 # pylint: disable=E1102
306
- )
307
- else:
308
- qkv_states = self.wqkv(hidden_states)
309
-
310
- qkv_states = rearrange(
311
- qkv_states,
312
- "b q (h gs d) -> b q h gs d",
313
- gs=2 + self.num_key_value_groups,
314
- d=self.head_dim,
315
- )
316
-
317
- query_states = qkv_states[..., : self.num_key_value_groups, :]
318
- query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d").transpose(1, 2)
319
- key_states = qkv_states[..., -2, :].transpose(1, 2)
320
- value_states = qkv_states[..., -1, :].transpose(1, 2)
321
-
322
- cos, sin = self.rotary_emb(value_states, position_ids)
323
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
324
-
325
- if past_key_value is not None:
326
- # sin and cos are specific to RoPE models; cache_position needed for the static cache
327
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
328
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
329
-
330
- key_states = repeat_kv(key_states, self.num_key_value_groups)
331
- value_states = repeat_kv(value_states, self.num_key_value_groups)
332
-
333
- attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
334
-
335
- if attention_mask is not None: # no matter the length, we just slice it
336
- causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
337
- attn_weights = attn_weights + causal_mask
338
-
339
- # upcast attention to fp32
340
- attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
341
- attn_output = torch.matmul(attn_weights, value_states)
342
-
343
- if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
344
- raise ValueError(
345
- f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
346
- f" {attn_output.size()}"
347
- )
348
-
349
- attn_output = attn_output.transpose(1, 2).contiguous()
350
-
351
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
352
-
353
- if self.config.pretraining_tp > 1:
354
- attn_output = attn_output.split(self.hidden_size // self.config.pretraining_tp, dim=2)
355
- o_proj_slices = self.wo.weight.split(self.hidden_size // self.config.pretraining_tp, dim=1)
356
- attn_output = sum(
357
- [
358
- F.linear(attn_output[i], o_proj_slices[i]) # pylint: disable=E1102
359
- for i in range(self.config.pretraining_tp)
360
- ]
361
- )
362
- else:
363
- attn_output = self.wo(attn_output)
364
-
365
- if not output_attentions:
366
- attn_weights = None
367
-
368
- return attn_output, attn_weights, past_key_value
369
-
370
-
371
- class InternLM2FlashAttention2(InternLM2Attention):
372
- """
373
- InternLM2 flash attention module. This module inherits from `InternLM2Attention` as the weights of the module stays
374
- untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
375
- flash attention and deal with padding tokens in case the input contains any of them.
376
- """
377
-
378
- def __init__(self, *args, **kwargs):
379
- super().__init__(*args, **kwargs)
380
-
381
- # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
382
- # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement,
383
- # that was made default for flash_attn>=2.1. This attribute is used to handle this difference.
384
- # Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
385
- # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1)
386
- # produces a wrong mask (top-left).
387
- self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
388
-
389
- def forward(
390
- self,
391
- hidden_states: torch.Tensor,
392
- attention_mask: Optional[torch.LongTensor] = None,
393
- position_ids: Optional[torch.LongTensor] = None,
394
- past_key_value: Optional[Cache] = None,
395
- output_attentions: bool = False,
396
- use_cache: bool = False,
397
- cache_position: Optional[torch.LongTensor] = None,
398
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
399
- if isinstance(past_key_value, StaticCache):
400
- raise ValueError(
401
- "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
402
- "make sure to use `sdpa` in the mean time, and open an issue at "
403
- "https://github.com/huggingface/transformers"
404
- )
405
-
406
- output_attentions = False
407
-
408
- bsz, q_len, _ = hidden_states.size()
409
-
410
- qkv_states = self.wqkv(hidden_states)
411
-
412
- qkv_states = rearrange(
413
- qkv_states,
414
- "b q (h gs d) -> b q h gs d",
415
- gs=2 + self.num_key_value_groups,
416
- d=self.head_dim,
417
- )
418
-
419
- query_states = qkv_states[..., : self.num_key_value_groups, :]
420
- query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
421
- key_states = qkv_states[..., -2, :]
422
- value_states = qkv_states[..., -1, :]
423
-
424
- query_states = query_states.transpose(1, 2)
425
- key_states = key_states.transpose(1, 2)
426
- value_states = value_states.transpose(1, 2)
427
-
428
- cos, sin = self.rotary_emb(value_states, position_ids)
429
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
430
-
431
- if past_key_value is not None:
432
- # sin and cos are specific to RoPE models; cache_position needed for the static cache
433
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
434
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
435
-
436
- # TODO: These transpose are quite inefficient but Flash Attention requires the layout
437
- # [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
438
- # to be able to avoid many of these transpose/reshape/view.
439
- query_states = query_states.transpose(1, 2)
440
- key_states = key_states.transpose(1, 2)
441
- value_states = value_states.transpose(1, 2)
442
-
443
- # dropout_rate = self.attention_dropout if self.training else 0.0
444
- dropout_rate = 0.0
445
-
446
- # In PEFT, usually we cast the layer norms in float32 for training stability reasons
447
- # therefore the input hidden states gets silently casted in float32. Hence, we need
448
- # cast them back in the correct dtype just to be sure everything works as expected.
449
- # This might slowdown training & inference so it is recommended to not cast the LayerNorms
450
- # in fp32. (InternLM2RMSNorm handles it correctly)
451
-
452
- input_dtype = query_states.dtype
453
- if input_dtype == torch.float32:
454
- if torch.is_autocast_enabled():
455
- target_dtype = torch.get_autocast_gpu_dtype()
456
- # Handle the case where the model is quantized
457
- elif hasattr(self.config, "_pre_quantization_dtype"):
458
- target_dtype = self.config._pre_quantization_dtype
459
- else:
460
- target_dtype = self.wqkv.weight.dtype
461
-
462
- logger.warning_once(
463
- f"The input hidden states seem to have been silently cast to float32; this is likely because you have"
464
- f" upcast the embedding or layer norm layers to float32. We will cast the input back to"
465
- f" {target_dtype}."
466
- )
467
-
468
- query_states = query_states.to(target_dtype)
469
- key_states = key_states.to(target_dtype)
470
- value_states = value_states.to(target_dtype)
471
-
472
- attn_output = self._flash_attention_forward(
473
- query_states, key_states, value_states, attention_mask, q_len, dropout=dropout_rate
474
- )
475
-
476
- attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
477
- attn_output = self.wo(attn_output)
478
-
479
- if not output_attentions:
480
- attn_weights = None
481
-
482
- return attn_output, attn_weights, past_key_value # pylint: disable=E0606
483
-
484
- def _flash_attention_forward(
485
- self, query_states, key_states, value_states, attention_mask, query_length, dropout=0.0, softmax_scale=None
486
- ):
487
- """
488
- Calls the forward method of Flash Attention - if the input hidden states contain at least one padding token,
489
- it first unpads the input, then computes the attention scores and pads the final attention scores.
490
-
491
- Args:
492
- query_states (`torch.Tensor`):
493
- Input query states to be passed to Flash Attention API
494
- key_states (`torch.Tensor`):
495
- Input key states to be passed to Flash Attention API
496
- value_states (`torch.Tensor`):
497
- Input value states to be passed to Flash Attention API
498
- attention_mask (`torch.Tensor`):
499
- The padding mask - corresponds to a tensor of size `(batch_size, seq_len)` where 0 stands for the
500
- position of padding tokens and 1 for the position of non-padding tokens.
501
- dropout (`float`):
502
- Attention dropout
503
- softmax_scale (`float`, *optional*):
504
- The scaling of QK^T before applying softmax. Defaults to 1 / sqrt(head_dim).
505
- """
506
- if not self._flash_attn_uses_top_left_mask:
507
- causal = self.is_causal
508
- else:
509
- # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1.
510
- # For details, please see the comment in InternLM2FlashAttention2 __init__.
511
- causal = self.is_causal and query_length != 1
512
-
513
- # Contains at least one padding token in the sequence
514
- if attention_mask is not None:
515
- batch_size = query_states.shape[0]
516
- query_states, key_states, value_states, indices_q, cu_seq_lens, max_seq_lens = self._upad_input(
517
- query_states, key_states, value_states, attention_mask, query_length
518
- )
519
-
520
- cu_seqlens_q, cu_seqlens_k = cu_seq_lens
521
- max_seqlen_in_batch_q, max_seqlen_in_batch_k = max_seq_lens
522
-
523
- attn_output_unpad = flash_attn_varlen_func( # pylint: disable=E0606
524
- query_states,
525
- key_states,
526
- value_states,
527
- cu_seqlens_q=cu_seqlens_q,
528
- cu_seqlens_k=cu_seqlens_k,
529
- max_seqlen_q=max_seqlen_in_batch_q,
530
- max_seqlen_k=max_seqlen_in_batch_k,
531
- dropout_p=dropout,
532
- softmax_scale=softmax_scale,
533
- causal=causal,
534
- )
535
-
536
- attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length) # pylint: disable=E0606
537
- else:
538
- attn_output = flash_attn_func( # pylint: disable=E0606
539
- query_states, key_states, value_states, dropout, softmax_scale=softmax_scale, causal=causal
540
- )
541
-
542
- return attn_output
543
-
544
- def _upad_input(self, query_layer, key_layer, value_layer, attention_mask, query_length):
545
- indices_k, cu_seqlens_k, max_seqlen_in_batch_k = _get_unpad_data(attention_mask)
546
- batch_size, kv_seq_len, num_key_value_heads, head_dim = key_layer.shape
547
-
548
- key_layer = index_first_axis( # pylint: disable=E0606
549
- key_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
550
- )
551
- value_layer = index_first_axis( # pylint: disable=E0606
552
- value_layer.reshape(batch_size * kv_seq_len, num_key_value_heads, head_dim), indices_k
553
- )
554
- if query_length == kv_seq_len:
555
- query_layer = index_first_axis( # pylint: disable=E0606
556
- query_layer.reshape(batch_size * kv_seq_len, self.num_heads, head_dim), indices_k
557
- )
558
- cu_seqlens_q = cu_seqlens_k
559
- max_seqlen_in_batch_q = max_seqlen_in_batch_k
560
- indices_q = indices_k
561
- elif query_length == 1:
562
- max_seqlen_in_batch_q = 1
563
- cu_seqlens_q = torch.arange(
564
- batch_size + 1, dtype=torch.int32, device=query_layer.device
565
- ) # There is a memcpy here, that is very bad.
566
- indices_q = cu_seqlens_q[:-1]
567
- query_layer = query_layer.squeeze(1)
568
- else:
569
- # The -q_len: slice assumes left padding.
570
- attention_mask = attention_mask[:, -query_length:]
571
- query_layer, indices_q, cu_seqlens_q, max_seqlen_in_batch_q = unpad_input( # pylint: disable=E0606
572
- query_layer, attention_mask
573
- )
574
-
575
- return (
576
- query_layer,
577
- key_layer,
578
- value_layer,
579
- indices_q,
580
- (cu_seqlens_q, cu_seqlens_k),
581
- (max_seqlen_in_batch_q, max_seqlen_in_batch_k),
582
- )
583
-
584
-
585
- # Copied from transformers.models.llama.modeling_llama.LlamaSdpaAttention with Llama->InternLM2
586
- class InternLM2SdpaAttention(InternLM2Attention):
587
- """
588
- InternLM2 attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
589
- `InternLM2Attention`, as the weights of the module stay untouched. The only changes are on the forward pass,
590
- to adapt to the SDPA API.
591
- """
592
-
593
- # Adapted from InternLM2Attention.forward
594
- def forward(
595
- self,
596
- hidden_states: torch.Tensor,
597
- attention_mask: Optional[torch.Tensor] = None,
598
- position_ids: Optional[torch.LongTensor] = None,
599
- past_key_value: Optional[Cache] = None,
600
- output_attentions: bool = False,
601
- use_cache: bool = False,
602
- cache_position: Optional[torch.LongTensor] = None,
603
- ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
604
- if output_attentions:
605
- # TODO: Improve this warning with e.g. `model.config.attn_implementation = "manual"`
606
- # once this is implemented.
607
- logger.warning_once(
608
- "InternLM2Model uses InternLM2SdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` "
609
- "does not support `output_attentions=True`. Falling back to the manual attention implementation, "
610
- "but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. "
611
- 'This warning can be removed using the argument `attn_implementation="eager"` when loading the model.'
612
- )
613
- return super().forward(
614
- hidden_states=hidden_states,
615
- attention_mask=attention_mask,
616
- position_ids=position_ids,
617
- past_key_value=past_key_value,
618
- output_attentions=output_attentions,
619
- use_cache=use_cache,
620
- cache_position=cache_position,
621
- )
622
-
623
- bsz, q_len, _ = hidden_states.size()
624
-
625
- qkv_states = self.wqkv(hidden_states)
626
-
627
- qkv_states = rearrange(
628
- qkv_states,
629
- "b q (h gs d) -> b q h gs d",
630
- gs=2 + self.num_key_value_groups,
631
- d=self.head_dim,
632
- )
633
-
634
- query_states = qkv_states[..., : self.num_key_value_groups, :]
635
- query_states = rearrange(query_states, "b q h gs d -> b q (h gs) d")
636
- key_states = qkv_states[..., -2, :]
637
- value_states = qkv_states[..., -1, :]
638
-
639
- query_states = query_states.transpose(1, 2)
640
- key_states = key_states.transpose(1, 2)
641
- value_states = value_states.transpose(1, 2)
642
-
643
- cos, sin = self.rotary_emb(value_states, position_ids)
644
- query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
645
-
646
- if past_key_value is not None:
647
- # sin and cos are specific to RoPE models; cache_position needed for the static cache
648
- cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
649
- key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
650
-
651
- key_states = repeat_kv(key_states, self.num_key_value_groups)
652
- value_states = repeat_kv(value_states, self.num_key_value_groups)
653
-
654
- causal_mask = attention_mask
655
- if attention_mask is not None:
656
- causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
657
-
658
- # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with
659
- # custom attn_mask, Reference: https://github.com/pytorch/pytorch/issues/112577.
660
- if query_states.device.type == "cuda" and causal_mask is not None:
661
- query_states = query_states.contiguous()
662
- key_states = key_states.contiguous()
663
- value_states = value_states.contiguous()
664
-
665
- # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of
666
- # an inline conditional assignment in SDPA to support both torch.compile's dynamic shapes and full graph
667
- # options. An inline conditional prevents dynamic shapes from compiling.
668
- is_causal = bool(causal_mask is None and q_len > 1)
669
-
670
- attn_output = torch.nn.functional.scaled_dot_product_attention( # pylint: disable=E1102
671
- query_states,
672
- key_states,
673
- value_states,
674
- attn_mask=causal_mask,
675
- dropout_p=0.0,
676
- is_causal=is_causal,
677
- )
678
-
679
- attn_output = attn_output.transpose(1, 2).contiguous()
680
- attn_output = attn_output.view(bsz, q_len, self.hidden_size)
681
-
682
- attn_output = self.wo(attn_output)
683
-
684
- return attn_output, None, past_key_value
685
-
686
-
687
- INTERNLM2_ATTENTION_CLASSES = {
688
- "eager": InternLM2Attention,
689
- "flash_attention_2": InternLM2FlashAttention2,
690
- "sdpa": InternLM2SdpaAttention,
691
- }
692
-
693
-
694
- # Modified from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->InternLM2
695
- class InternLM2DecoderLayer(nn.Module):
696
- """InternLM2 Decoder Layer. This module is a single layer of the InternLM2 model."""
697
-
698
- def __init__(self, config: InternLM2Config, layer_idx: int):
699
- super().__init__()
700
- self.hidden_size = config.hidden_size
701
- self.layer_idx = layer_idx
702
-
703
- self.attention = INTERNLM2_ATTENTION_CLASSES[config.attn_implementation](config=config, layer_idx=layer_idx)
704
-
705
- self.feed_forward = InternLM2MLP(config)
706
- self.attention_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
707
- self.ffn_norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
708
-
709
- def forward(
710
- self,
711
- hidden_states: torch.Tensor,
712
- attention_mask: Optional[torch.Tensor] = None,
713
- position_ids: Optional[torch.LongTensor] = None,
714
- past_key_value: Optional[Cache] = None,
715
- output_attentions: Optional[bool] = False,
716
- use_cache: Optional[bool] = False,
717
- cache_position: Optional[torch.LongTensor] = None,
718
- ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
719
- """
720
- Args:
721
- hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
722
- attention_mask (`torch.FloatTensor`, *optional*):
723
- attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
724
- query_sequence_length, key_sequence_length)` if default attention is used.
725
- output_attentions (`bool`, *optional*):
726
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under
727
- returned tensors for more detail.
728
- use_cache (`bool`, *optional*):
729
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
730
- (see `past_key_values`).
731
- past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
732
- """
733
- residual = hidden_states
734
-
735
- hidden_states = self.attention_norm(hidden_states)
736
-
737
- # Self Attention
738
- hidden_states, self_attn_weights, present_key_value = self.attention(
739
- hidden_states=hidden_states,
740
- attention_mask=attention_mask,
741
- position_ids=position_ids,
742
- past_key_value=past_key_value,
743
- output_attentions=output_attentions,
744
- use_cache=use_cache,
745
- cache_position=cache_position,
746
- )
747
- hidden_states = residual + hidden_states
748
-
749
- # Fully Connected
750
- residual = hidden_states
751
- hidden_states = self.ffn_norm(hidden_states)
752
- hidden_states = self.feed_forward(hidden_states)
753
- hidden_states = residual + hidden_states
754
-
755
- outputs = (hidden_states,)
756
-
757
- if output_attentions:
758
- outputs += (self_attn_weights,)
759
-
760
- if use_cache:
761
- outputs += (present_key_value,)
762
-
763
- return outputs
764
-
765
-
766
- InternLM2_START_DOCSTRING = r"""
767
- This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
768
- library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
769
- etc.)
770
-
771
- This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
772
- Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
773
- and behavior.
774
-
775
- Parameters:
776
- config ([`InternLM2Config`]):
777
- Model configuration class with all the parameters of the model. Initializing with a config file does not
778
- load the weights associated with the model, only the configuration. Check out the
779
- [`~PreTrainedModel.from_pretrained`] method to load the model weights.
780
- """
781
-
782
-
783
- # Copied from transformers.models.llama.modeling_llama.LlamaPreTrainedModel with Llama->InternLM2
784
- @add_start_docstrings(
785
- "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
786
- InternLM2_START_DOCSTRING,
787
- )
788
- class InternLM2PreTrainedModel(PreTrainedModel):
789
- """
790
- Base class for InternLM2 pretrained models.
791
- """
792
-
793
- config_class = InternLM2Config
794
- base_model_prefix = "model"
795
- supports_gradient_checkpointing = True
796
- _no_split_modules = ["InternLM2DecoderLayer"]
797
- _skip_keys_device_placement = ["past_key_values"]
798
- _supports_flash_attn_2 = True
799
- _supports_sdpa = True
800
- _supports_cache_class = True
801
- _supports_quantized_cache = True
802
- _supports_static_cache = True
803
-
804
- def _init_weights(self, module):
805
- std = self.config.initializer_range
806
- if isinstance(module, nn.Linear):
807
- module.weight.data.normal_(mean=0.0, std=std)
808
- if module.bias is not None:
809
- module.bias.data.zero_()
810
- elif isinstance(module, nn.Embedding):
811
- module.weight.data.normal_(mean=0.0, std=std)
812
- if module.padding_idx is not None:
813
- module.weight.data[module.padding_idx].zero_()
814
-
815
-
816
- InternLM2_INPUTS_DOCSTRING = r"""
817
- Args:
818
- input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
819
- Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
820
- it.
821
-
822
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
823
- [`PreTrainedTokenizer.__call__`] for details.
824
-
825
- [What are input IDs?](../glossary#input-ids)
826
- attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
827
- Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
828
-
829
- - 1 for tokens that are **not masked**,
830
- - 0 for tokens that are **masked**.
831
-
832
- [What are attention masks?](../glossary#attention-mask)
833
-
834
- Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
835
- [`PreTrainedTokenizer.__call__`] for details.
836
-
837
- If `past_key_values` is used, optionally only the last `input_ids` have to be input (see
838
- `past_key_values`).
839
-
840
- If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
841
- and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
842
- information on the default strategy.
843
-
844
- - 1 indicates the head is **not masked**,
845
- - 0 indicates the head is **masked**.
846
- position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
847
- Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
848
- config.n_positions - 1]`.
849
-
850
- [What are position IDs?](../glossary#position-ids)
851
- past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
852
- Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
853
- blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
854
- returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
855
-
856
- Two formats are allowed:
857
- - a [`~cache_utils.Cache`] instance;
858
- - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
859
- shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
860
- cache format.
861
-
862
- The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
863
- legacy cache format will be returned.
864
-
865
- If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
866
- have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
867
- of shape `(batch_size, sequence_length)`.
868
- inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
869
- Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
870
- is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
871
- model's internal embedding lookup matrix.
872
- use_cache (`bool`, *optional*):
873
- If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
874
- `past_key_values`).
875
- output_attentions (`bool`, *optional*):
876
- Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
877
- tensors for more detail.
878
- output_hidden_states (`bool`, *optional*):
879
- Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
880
- more detail.
881
- return_dict (`bool`, *optional*):
882
- Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
883
- cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
884
- Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
885
- this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
886
- the complete sequence length.
887
- """
888
-
889
-
890
- # Modified from transformers.models.llama.modeling_llama.LlamaModel with Llama->InternLM2
891
- @add_start_docstrings(
892
- "The bare InternLM2 Model outputting raw hidden-states without any specific head on top.",
893
- InternLM2_START_DOCSTRING,
894
- )
895
- class InternLM2Model(InternLM2PreTrainedModel):
896
- """
897
- Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`InternLM2DecoderLayer`]
898
-
899
- Args:
900
- config: InternLM2Config
901
- """
902
-
903
- _auto_class = "AutoModel"
904
-
905
- def __init__(self, config: InternLM2Config):
906
- super().__init__(config)
907
- self.padding_idx = config.pad_token_id
908
- self.vocab_size = config.vocab_size
909
- self.config = config
910
-
911
- self.tok_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
912
-
913
- self.layers = nn.ModuleList(
914
- [InternLM2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
915
- )
916
- self.norm = InternLM2RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
917
-
918
- self.gradient_checkpointing = False
919
- # Initialize weights and apply final processing
920
- self.post_init()
921
-
922
- def get_input_embeddings(self):
923
- return self.tok_embeddings
924
-
925
- def set_input_embeddings(self, value):
926
- self.tok_embeddings = value
927
-
928
- @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
929
- def forward(
930
- self,
931
- input_ids: torch.LongTensor = None,
932
- attention_mask: Optional[torch.Tensor] = None,
933
- position_ids: Optional[torch.LongTensor] = None,
934
- past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
935
- inputs_embeds: Optional[torch.FloatTensor] = None,
936
- use_cache: Optional[bool] = None,
937
- output_attentions: Optional[bool] = None,
938
- output_hidden_states: Optional[bool] = None,
939
- return_dict: Optional[bool] = None,
940
- cache_position: Optional[torch.LongTensor] = None,
941
- ) -> Union[Tuple, BaseModelOutputWithPast]:
942
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
943
- output_hidden_states = (
944
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
945
- )
946
- use_cache = use_cache if use_cache is not None else self.config.use_cache
947
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
948
-
949
- if (input_ids is None) ^ (inputs_embeds is not None):
950
- raise ValueError(
951
- "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
952
- )
953
-
954
- if self.gradient_checkpointing and self.training and use_cache:
955
- logger.warning_once(
956
- "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
957
- )
958
- use_cache = False
959
-
960
- if inputs_embeds is None:
961
- inputs_embeds = self.tok_embeddings(input_ids)
962
-
963
- return_legacy_cache = False
964
- if use_cache and not isinstance(past_key_values, Cache): # kept for BC (non `Cache` `past_key_values` inputs)
965
- return_legacy_cache = True
966
- past_key_values = DynamicCache.from_legacy_cache(past_key_values)
967
-
968
- if cache_position is None:
969
- past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
970
- cache_position = torch.arange(
971
- past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
972
- )
973
- if position_ids is None:
974
- position_ids = cache_position.unsqueeze(0)
975
-
976
- causal_mask = self._update_causal_mask(
977
- attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
978
- )
979
-
980
- # embed positions
981
- hidden_states = inputs_embeds
982
-
983
- # decoder layers
984
- all_hidden_states = () if output_hidden_states else None
985
- all_self_attns = () if output_attentions else None
986
- next_decoder_cache = None
987
-
988
- for decoder_layer in self.layers:
989
- if output_hidden_states:
990
- all_hidden_states += (hidden_states,)
991
-
992
- if self.gradient_checkpointing and self.training:
993
- layer_outputs = self._gradient_checkpointing_func(
994
- decoder_layer.__call__,
995
- hidden_states,
996
- causal_mask,
997
- position_ids,
998
- past_key_values,
999
- output_attentions,
1000
- use_cache,
1001
- cache_position,
1002
- )
1003
- else:
1004
- layer_outputs = decoder_layer(
1005
- hidden_states,
1006
- attention_mask=causal_mask,
1007
- position_ids=position_ids,
1008
- past_key_value=past_key_values,
1009
- output_attentions=output_attentions,
1010
- use_cache=use_cache,
1011
- cache_position=cache_position,
1012
- )
1013
-
1014
- hidden_states = layer_outputs[0]
1015
-
1016
- if use_cache:
1017
- next_decoder_cache = layer_outputs[2 if output_attentions else 1]
1018
-
1019
- if output_attentions:
1020
- all_self_attns += (layer_outputs[1],)
1021
-
1022
- hidden_states = self.norm(hidden_states)
1023
-
1024
- # add hidden states from the last decoder layer
1025
- if output_hidden_states:
1026
- all_hidden_states += (hidden_states,)
1027
-
1028
- next_cache = next_decoder_cache if use_cache else None
1029
- if return_legacy_cache:
1030
- next_cache = next_cache.to_legacy_cache()
1031
-
1032
- if not return_dict:
1033
- return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
1034
- return BaseModelOutputWithPast(
1035
- last_hidden_state=hidden_states,
1036
- past_key_values=next_cache,
1037
- hidden_states=all_hidden_states,
1038
- attentions=all_self_attns,
1039
- )
1040
-
1041
- def _update_causal_mask(
1042
- self,
1043
- attention_mask: torch.Tensor,
1044
- input_tensor: torch.Tensor,
1045
- cache_position: torch.Tensor,
1046
- past_key_values: Cache,
1047
- output_attentions: bool,
1048
- ):
1049
- # TODO: As of torch==2.2.0, the `attention_mask` passed to the model in `generate` is 2D and of dynamic length
1050
- # even when the static KV cache is used. This is an issue for torch.compile which then recaptures cudagraphs at
1051
- # each decode steps due to the dynamic shapes. (`recording cudagraph tree for symint key 13`, etc.), which is
1052
- # VERY slow. A workaround is `@torch.compiler.disable`, but this prevents using `fullgraph=True`.
1053
- # See more context in https://github.com/huggingface/transformers/pull/29114
1054
-
1055
- if self.config.attn_implementation == "flash_attention_2":
1056
- if attention_mask is not None and 0.0 in attention_mask:
1057
- return attention_mask
1058
- return None
1059
-
1060
- # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
1061
- # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
1062
- # to infer the attention mask.
1063
- past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
1064
- using_static_cache = isinstance(past_key_values, StaticCache)
1065
-
1066
- # When output_attentions is True, the sdpa implementation's forward method calls the eager implementation's forward
1067
- if self.config.attn_implementation == "sdpa" and not using_static_cache and not output_attentions:
1068
- if AttentionMaskConverter._ignore_causal_mask_sdpa(
1069
- attention_mask,
1070
- inputs_embeds=input_tensor,
1071
- past_key_values_length=past_seen_tokens,
1072
- is_training=self.training,
1073
- ):
1074
- return None
1075
-
1076
- dtype, device = input_tensor.dtype, input_tensor.device
1077
- min_dtype = torch.finfo(dtype).min
1078
- sequence_length = input_tensor.shape[1]
1079
- if using_static_cache:
1080
- target_length = past_key_values.get_max_length()
1081
- else:
1082
- target_length = (
1083
- attention_mask.shape[-1]
1084
- if isinstance(attention_mask, torch.Tensor)
1085
- else past_seen_tokens + sequence_length + 1
1086
- )
1087
-
1088
- if attention_mask is not None and attention_mask.dim() == 4:
1089
- # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
1090
- if attention_mask.max() != 0:
1091
- raise ValueError("Custom 4D attention mask should be passed in inverted form with max==0")
1092
- causal_mask = attention_mask
1093
- else:
1094
- causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
1095
- if sequence_length != 1:
1096
- causal_mask = torch.triu(causal_mask, diagonal=1)
1097
- causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
1098
- causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
1099
- if attention_mask is not None:
1100
- causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
1101
- mask_length = attention_mask.shape[-1]
1102
- padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :]
1103
- padding_mask = padding_mask == 0
1104
- causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
1105
- padding_mask, min_dtype
1106
- )
1107
- if (
1108
- self.config.attn_implementation == "sdpa"
1109
- and attention_mask is not None
1110
- and attention_mask.device.type == "cuda"
1111
- and not output_attentions
1112
- ):
1113
- # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
1114
- # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
1115
- # Details: https://github.com/pytorch/pytorch/issues/110213
1116
- causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype) # pylint: disable=E1120
1117
-
1118
- return causal_mask
1119
-
1120
-
1121
- # Modified from transformers.models.llama.modeling_llama.LlamaForCausalLM
1122
- class InternLM2ForCausalLM(InternLM2PreTrainedModel):
1123
- """Causal language model (CLM) for InternLM2."""
1124
-
1125
- _auto_class = "AutoModelForCausalLM"
1126
- _tied_weights_keys = ["output.weight"]
1127
-
1128
- def __init__(self, config):
1129
- super().__init__(config)
1130
- self.model = InternLM2Model(config)
1131
- self.vocab_size = config.vocab_size
1132
- self.output = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
1133
-
1134
- # Initialize weights and apply final processing
1135
- self.post_init()
1136
-
1137
- def get_input_embeddings(self):
1138
- return self.model.tok_embeddings
1139
-
1140
- def set_input_embeddings(self, value):
1141
- self.model.tok_embeddings = value
1142
-
1143
- def get_output_embeddings(self):
1144
- return self.output
1145
-
1146
- def set_output_embeddings(self, new_embeddings):
1147
- self.output = new_embeddings
1148
-
1149
- def set_decoder(self, decoder):
1150
- self.model = decoder
1151
-
1152
- def get_decoder(self):
1153
- return self.model
1154
-
1155
- @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
1156
- @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
1157
- def forward(
1158
- self,
1159
- input_ids: torch.LongTensor = None,
1160
- attention_mask: Optional[torch.Tensor] = None,
1161
- position_ids: Optional[torch.LongTensor] = None,
1162
- past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1163
- inputs_embeds: Optional[torch.FloatTensor] = None,
1164
- labels: Optional[torch.LongTensor] = None,
1165
- use_cache: Optional[bool] = None,
1166
- output_attentions: Optional[bool] = None,
1167
- output_hidden_states: Optional[bool] = None,
1168
- return_dict: Optional[bool] = None,
1169
- cache_position: Optional[torch.LongTensor] = None,
1170
- ) -> Union[Tuple, CausalLMOutputWithPast]:
1171
- r"""
1172
- Args:
1173
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1174
- Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
1175
- config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
1176
- (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
1177
-
1178
- Returns:
1179
-
1180
- Example:
1181
-
1182
- ```python
1183
- >>> from transformers import AutoTokenizer, InternLM2ForCausalLM
1184
-
1185
- >>> model = InternLM2ForCausalLM.from_pretrained("internlm/internlm2-7b")
1186
- >>> tokenizer = AutoTokenizer.from_pretrained("internlm/internlm2-7b", trust_remote_code=True)
1187
-
1188
- >>> prompt = "Hey, are you conscious? Can you talk to me?"
1189
- >>> inputs = tokenizer(prompt, return_tensors="pt")
1190
-
1191
- >>> # Generate
1192
- >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
1193
- >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
1194
- "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
1195
- ```"""
1196
-
1197
- output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1198
- output_hidden_states = (
1199
- output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1200
- )
1201
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1202
-
1203
- # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
1204
- outputs = self.model(
1205
- input_ids=input_ids,
1206
- attention_mask=attention_mask,
1207
- position_ids=position_ids,
1208
- past_key_values=past_key_values,
1209
- inputs_embeds=inputs_embeds,
1210
- use_cache=use_cache,
1211
- output_attentions=output_attentions,
1212
- output_hidden_states=output_hidden_states,
1213
- return_dict=return_dict,
1214
- cache_position=cache_position,
1215
- )
1216
-
1217
- hidden_states = outputs[0]
1218
- if self.config.pretraining_tp > 1:
1219
- output_slices = self.output.weight.split(self.vocab_size // self.config.pretraining_tp, dim=0)
1220
- logits = [
1221
- F.linear(hidden_states, output_slices[i]) # pylint: disable=not-callable
1222
- for i in range(self.config.pretraining_tp)
1223
- ]
1224
- logits = torch.cat(logits, dim=-1)
1225
- else:
1226
- logits = self.output(hidden_states)
1227
- logits = logits.float()
1228
-
1229
- loss = None
1230
- if labels is not None:
1231
- # Shift so that tokens < n predict n
1232
- shift_logits = logits[..., :-1, :].contiguous()
1233
- shift_labels = labels[..., 1:].contiguous()
1234
- # Flatten the tokens
1235
- loss_fct = CrossEntropyLoss()
1236
- shift_logits = shift_logits.view(-1, self.config.vocab_size)
1237
- shift_labels = shift_labels.view(-1)
1238
- # Enable model parallelism
1239
- shift_labels = shift_labels.to(shift_logits.device)
1240
- loss = loss_fct(shift_logits, shift_labels)
1241
-
1242
- if not return_dict:
1243
- output = (logits,) + outputs[1:]
1244
- return (loss,) + output if loss is not None else output
1245
-
1246
- return CausalLMOutputWithPast(
1247
- loss=loss,
1248
- logits=logits,
1249
- past_key_values=outputs.past_key_values,
1250
- hidden_states=outputs.hidden_states,
1251
- attentions=outputs.attentions,
1252
- )
1253
-
1254
- def prepare_inputs_for_generation(
1255
- self,
1256
- input_ids,
1257
- past_key_values=None,
1258
- attention_mask=None,
1259
- inputs_embeds=None,
1260
- cache_position=None,
1261
- use_cache=True,
1262
- **kwargs,
1263
- ):
1264
- past_length = 0
1265
- if past_key_values is not None:
1266
- if isinstance(past_key_values, Cache):
1267
- past_length = cache_position[0] if cache_position is not None else past_key_values.get_seq_length()
1268
- max_cache_length = (
1269
- torch.tensor(past_key_values.get_max_length(), device=input_ids.device)
1270
- if past_key_values.get_max_length() is not None
1271
- else None
1272
- )
1273
- cache_length = past_length if max_cache_length is None else torch.min(max_cache_length, past_length)
1274
- # TODO joao: remove this `else` after `generate` prioritizes `Cache` objects
1275
- else:
1276
- cache_length = past_length = past_key_values[0][0].shape[2]
1277
- max_cache_length = None
1278
-
1279
- # Keep only the unprocessed tokens:
1280
- # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
1281
- # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as input)
1282
- if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
1283
- input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
1284
- # 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
1285
- # input_ids based on the past_length.
1286
- elif past_length < input_ids.shape[1]:
1287
- input_ids = input_ids[:, past_length:]
1288
- # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
1289
-
1290
- # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
1291
- if (
1292
- max_cache_length is not None
1293
- and attention_mask is not None
1294
- and cache_length + input_ids.shape[1] > max_cache_length
1295
- ):
1296
- attention_mask = attention_mask[:, -max_cache_length:] # pylint: disable=E1130
1297
-
1298
- position_ids = kwargs.get("position_ids", None)
1299
- if attention_mask is not None and position_ids is None:
1300
- # create position_ids on the fly for batch generation
1301
- position_ids = attention_mask.long().cumsum(-1) - 1
1302
- position_ids.masked_fill_(attention_mask == 0, 1)
1303
- if past_key_values:
1304
- position_ids = position_ids[:, -input_ids.shape[1] :]
1305
-
1306
- # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
1307
- if inputs_embeds is not None and past_key_values is None:
1308
- model_inputs = {"inputs_embeds": inputs_embeds}
1309
- else:
1310
- # The `contiguous()` here is necessary to have a static stride during decoding. torchdynamo otherwise
1311
- # recompiles graphs as the stride of the inputs is a guard.
1312
- # Ref: https://github.com/huggingface/transformers/pull/29114
1313
- # TODO: use `next_tokens` directly instead.
1314
- model_inputs = {"input_ids": input_ids.contiguous()}
1315
-
1316
- input_length = position_ids.shape[-1] if position_ids is not None else input_ids.shape[-1]
1317
- if cache_position is None:
1318
- cache_position = torch.arange(past_length, past_length + input_length, device=input_ids.device)
1319
- elif use_cache:
1320
- cache_position = cache_position[-input_length:]
1321
-
1322
- model_inputs.update(
1323
- {
1324
- "position_ids": position_ids,
1325
- "cache_position": cache_position,
1326
- "past_key_values": past_key_values,
1327
- "use_cache": use_cache,
1328
- "attention_mask": attention_mask,
1329
- }
1330
- )
1331
- return model_inputs
1332
-
1333
- @staticmethod
1334
- def _reorder_cache(past_key_values, beam_idx):
1335
- reordered_past = ()
1336
- for layer_past in past_key_values:
1337
- reordered_past += (
1338
- tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
1339
- )
1340
- return reordered_past
1341
-
1342
- def build_inputs(self, tokenizer, query: str, history: List[Tuple[str, str]] = None, meta_instruction=""):
1343
- if history is None:
1344
- history = []
1345
- if tokenizer.add_bos_token:
1346
- prompt = ""
1347
- else:
1348
- prompt = tokenizer.bos_token
1349
- if meta_instruction:
1350
- prompt += f"""<|im_start|>system\n{meta_instruction}<|im_end|>\n"""
1351
- for record in history:
1352
- prompt += f"""<|im_start|>user\n{record[0]}<|im_end|>\n<|im_start|>assistant\n{record[1]}<|im_end|>\n"""
1353
- prompt += f"""<|im_start|>user\n{query}<|im_end|>\n<|im_start|>assistant\n"""
1354
- return tokenizer([prompt], return_tensors="pt")
1355
-
1356
- @torch.no_grad()
1357
- def chat(
1358
- self,
1359
- tokenizer,
1360
- query: str,
1361
- history: Optional[List[Tuple[str, str]]] = None,
1362
- streamer: Optional[BaseStreamer] = None,
1363
- max_new_tokens: int = 1024,
1364
- do_sample: bool = True,
1365
- temperature: float = 0.8,
1366
- top_p: float = 0.8,
1367
- meta_instruction: str = "You are an AI assistant whose name is InternLM (书生·浦语).\n"
1368
- "- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory "
1369
- "(上海人工智能实验室). It is designed to be helpful, honest, and harmless.\n"
1370
- "- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such "
1371
- "as English and 中文.",
1372
- **kwargs,
1373
- ):
1374
- if history is None:
1375
- history = []
1376
- inputs = self.build_inputs(tokenizer, query, history, meta_instruction)
1377
- inputs = {k: v.to(self.device) for k, v in inputs.items() if torch.is_tensor(v)}
1378
- # also add end-of-assistant token in eos token id to avoid unnecessary generation
1379
- eos_token_id = [tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids(["<|im_end|>"])[0]]
1380
- outputs = self.generate(
1381
- **inputs,
1382
- streamer=streamer,
1383
- max_new_tokens=max_new_tokens,
1384
- do_sample=do_sample,
1385
- temperature=temperature,
1386
- top_p=top_p,
1387
- eos_token_id=eos_token_id,
1388
- **kwargs,
1389
- )
1390
- outputs = outputs[0].cpu().tolist()[len(inputs["input_ids"][0]) :]
1391
- response = tokenizer.decode(outputs, skip_special_tokens=True)
1392
- response = response.split("<|im_end|>")[0]
1393
- history = history + [(query, response)]
1394
- return response, history
1395
-
1396
- @torch.no_grad()
1397
- def stream_chat(
1398
- self,
1399
- tokenizer,
1400
- query: str,
1401
- history: List[Tuple[str, str]] = None,
1402
- max_new_tokens: int = 1024,
1403
- do_sample: bool = True,
1404
- temperature: float = 0.8,
1405
- top_p: float = 0.8,
1406
- **kwargs,
1407
- ):
1408
- """
1409
- Return a generator in format: (response, history)
1410
- E.g.
1411
- ('你好,有什么可以帮助您的吗', [('你好', '你好,有什么可以帮助您的吗')])
1412
- ('你好,有什么可以帮助您的吗?', [('你好', '你好,有什么可以帮助您的吗?')])
1413
- """
1414
- if history is None:
1415
- history = []
1416
- if BaseStreamer is None:
1417
- raise ModuleNotFoundError(
1418
- "The version of `transformers` is too low. Please make sure "
1419
- "that you have installed `transformers>=4.28.0`."
1420
- )
1421
-
1422
- response_queue = queue.Queue(maxsize=20)
1423
-
1424
- class ChatStreamer(BaseStreamer):
1425
- """
1426
- Streamer used in generate to print words one by one.
1427
- """
1428
-
1429
- def __init__(self, tokenizer) -> None:
1430
- super().__init__()
1431
- self.tokenizer = tokenizer
1432
- self.queue = response_queue
1433
- self.query = query
1434
- self.history = history
1435
- self.response = ""
1436
- self.cache = []
1437
- self.received_inputs = False
1438
- self.queue.put((self.response, history + [(self.query, self.response)]))
1439
-
1440
- def put(self, value):
1441
- if len(value.shape) > 1 and value.shape[0] > 1:
1442
- raise ValueError("ChatStreamer only supports batch size 1")
1443
- elif len(value.shape) > 1:
1444
- value = value[0]
1445
-
1446
- if not self.received_inputs:
1447
- # The first received value is input_ids, ignore here
1448
- self.received_inputs = True
1449
- return
1450
-
1451
- self.cache.extend(value.tolist())
1452
- token = self.tokenizer.decode(self.cache, skip_special_tokens=True)
1453
- if token.strip() != "<|im_end|>":
1454
- self.response = self.response + token
1455
- history = self.history + [(self.query, self.response)]
1456
- self.queue.put((self.response, history))
1457
- self.cache = []
1458
- else:
1459
- self.end()
1460
-
1461
- def end(self):
1462
- self.queue.put(None)
1463
-
1464
- def stream_producer():
1465
- return self.chat(
1466
- tokenizer=tokenizer,
1467
- query=query,
1468
- streamer=ChatStreamer(tokenizer=tokenizer),
1469
- history=history,
1470
- max_new_tokens=max_new_tokens,
1471
- do_sample=do_sample,
1472
- temperature=temperature,
1473
- top_p=top_p,
1474
- **kwargs,
1475
- )
1476
-
1477
- def consumer():
1478
- producer = threading.Thread(target=stream_producer)
1479
- producer.start()
1480
- while True:
1481
- res = response_queue.get()
1482
- if res is None:
1483
- return
1484
- yield res
1485
-
1486
- return consumer()
1487
-
1488
-
1489
- # Copied from transformers.models.llama.modeling_llama.LlamaForSequenceClassification with Llama->InternLM2
1490
- @add_start_docstrings(
1491
- """
1492
- The InternLM2 Model transformer with a sequence classification head on top (linear layer).
1493
-
1494
- [`InternLM2ForSequenceClassification`] uses the last token in order to do the classification, as other causal models
1495
- (e.g. GPT-2) do.
1496
-
1497
- Since it does classification on the last token, it needs to know the position of the last token. If a
1498
- `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
1499
- no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
1500
- padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
1501
- each row of the batch).
1502
- """,
1503
- InternLM2_START_DOCSTRING,
1504
- )
1505
- class InternLM2ForSequenceClassification(InternLM2PreTrainedModel):
1506
- """Sequence Classification Head for InternLM2 Model."""
1507
-
1508
- def __init__(self, config):
1509
- super().__init__(config)
1510
- self.num_labels = config.num_labels
1511
- self.model = InternLM2Model(config)
1512
- self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
1513
-
1514
- # Initialize weights and apply final processing
1515
- self.post_init()
1516
-
1517
- def get_input_embeddings(self):
1518
- return self.model.tok_embeddings
1519
-
1520
- def set_input_embeddings(self, value):
1521
- self.model.tok_embeddings = value
1522
-
1523
- @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
1524
- def forward(
1525
- self,
1526
- input_ids: torch.LongTensor = None,
1527
- attention_mask: Optional[torch.Tensor] = None,
1528
- position_ids: Optional[torch.LongTensor] = None,
1529
- past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1530
- inputs_embeds: Optional[torch.FloatTensor] = None,
1531
- labels: Optional[torch.LongTensor] = None,
1532
- use_cache: Optional[bool] = None,
1533
- output_attentions: Optional[bool] = None,
1534
- output_hidden_states: Optional[bool] = None,
1535
- return_dict: Optional[bool] = None,
1536
- ) -> Union[Tuple, SequenceClassifierOutputWithPast]:
1537
- r"""
1538
- labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1539
- Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1540
- config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1541
- `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1542
- """
1543
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1544
-
1545
- transformer_outputs = self.model(
1546
- input_ids,
1547
- attention_mask=attention_mask,
1548
- position_ids=position_ids,
1549
- past_key_values=past_key_values,
1550
- inputs_embeds=inputs_embeds,
1551
- use_cache=use_cache,
1552
- output_attentions=output_attentions,
1553
- output_hidden_states=output_hidden_states,
1554
- return_dict=return_dict,
1555
- )
1556
- hidden_states = transformer_outputs[0]
1557
- logits = self.score(hidden_states)
1558
-
1559
- if input_ids is not None:
1560
- batch_size = input_ids.shape[0]
1561
- else:
1562
- batch_size = inputs_embeds.shape[0]
1563
-
1564
- if self.config.pad_token_id is None and batch_size != 1:
1565
- raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
1566
- if self.config.pad_token_id is None:
1567
- sequence_lengths = -1
1568
- else:
1569
- if input_ids is not None:
1570
- # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
1571
- sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
1572
- sequence_lengths = sequence_lengths % input_ids.shape[-1]
1573
- sequence_lengths = sequence_lengths.to(logits.device)
1574
- else:
1575
- sequence_lengths = -1
1576
-
1577
- pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
1578
-
1579
- loss = None
1580
- if labels is not None:
1581
- labels = labels.to(logits.device)
1582
- if self.config.problem_type is None:
1583
- if self.num_labels == 1:
1584
- self.config.problem_type = "regression"
1585
- elif self.num_labels > 1 and (labels.dtype in (torch.long, torch.int)):
1586
- self.config.problem_type = "single_label_classification"
1587
- else:
1588
- self.config.problem_type = "multi_label_classification"
1589
-
1590
- if self.config.problem_type == "regression":
1591
- loss_fct = MSELoss()
1592
- if self.num_labels == 1:
1593
- loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
1594
- else:
1595
- loss = loss_fct(pooled_logits, labels)
1596
- elif self.config.problem_type == "single_label_classification":
1597
- loss_fct = CrossEntropyLoss()
1598
- loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
1599
- elif self.config.problem_type == "multi_label_classification":
1600
- loss_fct = BCEWithLogitsLoss()
1601
- loss = loss_fct(pooled_logits, labels)
1602
- if not return_dict:
1603
- output = (pooled_logits,) + transformer_outputs[1:]
1604
- return ((loss,) + output) if loss is not None else output
1605
-
1606
- return SequenceClassifierOutputWithPast(
1607
- loss=loss,
1608
- logits=pooled_logits,
1609
- past_key_values=transformer_outputs.past_key_values,
1610
- hidden_states=transformer_outputs.hidden_states,
1611
- attentions=transformer_outputs.attentions,
1612
- )
1613
-
1614
-
1615
- # Copied from transformers.models.llama.modeling_llama.LlamaForQuestionAnswering with Llama->InternLM2
1616
- @add_start_docstrings(
1617
- """
1618
- The InternLM2 Model transformer with a span classification head on top for extractive question-answering tasks like
1619
- SQuAD (a linear layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
1620
- """,
1621
- InternLM2_START_DOCSTRING,
1622
- )
1623
- class InternLM2ForQuestionAnswering(InternLM2PreTrainedModel):
1624
- """Question Answering model for InternLM2."""
1625
-
1626
- base_model_prefix = "transformer"
1627
-
1628
- def __init__(self, config):
1629
- super().__init__(config)
1630
- self.transformer = InternLM2Model(config)
1631
- self.qa_outputs = nn.Linear(config.hidden_size, 2)
1632
-
1633
- # Initialize weights and apply final processing
1634
- self.post_init()
1635
-
1636
- def get_input_embeddings(self):
1637
- return self.transformer.tok_embeddings
1638
-
1639
- def set_input_embeddings(self, value):
1640
- self.transformer.tok_embeddings = value
1641
-
1642
- @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
1643
- def forward(
1644
- self,
1645
- input_ids: Optional[torch.LongTensor] = None,
1646
- attention_mask: Optional[torch.FloatTensor] = None,
1647
- position_ids: Optional[torch.LongTensor] = None,
1648
- past_key_values: Optional[Union[Cache, List[torch.FloatTensor]]] = None,
1649
- inputs_embeds: Optional[torch.FloatTensor] = None,
1650
- start_positions: Optional[torch.LongTensor] = None,
1651
- end_positions: Optional[torch.LongTensor] = None,
1652
- output_attentions: Optional[bool] = None,
1653
- output_hidden_states: Optional[bool] = None,
1654
- return_dict: Optional[bool] = None,
1655
- ) -> Union[Tuple, QuestionAnsweringModelOutput]:
1656
- r"""
1657
- start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1658
- Labels for position (index) of the start of the labelled span for computing the token classification loss.
1659
- Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1660
- are not taken into account for computing the loss.
1661
- end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1662
- Labels for position (index) of the end of the labelled span for computing the token classification loss.
1663
- Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
1664
- are not taken into account for computing the loss.
1665
- """
1666
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1667
-
1668
- outputs = self.transformer(
1669
- input_ids,
1670
- attention_mask=attention_mask,
1671
- position_ids=position_ids,
1672
- past_key_values=past_key_values,
1673
- inputs_embeds=inputs_embeds,
1674
- output_attentions=output_attentions,
1675
- output_hidden_states=output_hidden_states,
1676
- return_dict=return_dict,
1677
- )
1678
-
1679
- sequence_output = outputs[0]
1680
-
1681
- logits = self.qa_outputs(sequence_output)
1682
- start_logits, end_logits = logits.split(1, dim=-1)
1683
- start_logits = start_logits.squeeze(-1).contiguous()
1684
- end_logits = end_logits.squeeze(-1).contiguous()
1685
-
1686
- total_loss = None
1687
- if start_positions is not None and end_positions is not None:
1688
- # If we are on multi-GPU, splitting adds a dimension
1689
- if len(start_positions.size()) > 1:
1690
- start_positions = start_positions.squeeze(-1).to(start_logits.device)
1691
- if len(end_positions.size()) > 1:
1692
- end_positions = end_positions.squeeze(-1).to(end_logits.device)
1693
- # sometimes the start/end positions are outside our model inputs, we ignore these terms
1694
- ignored_index = start_logits.size(1)
1695
- start_positions = start_positions.clamp(0, ignored_index)
1696
- end_positions = end_positions.clamp(0, ignored_index)
1697
-
1698
- loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
1699
- start_loss = loss_fct(start_logits, start_positions)
1700
- end_loss = loss_fct(end_logits, end_positions)
1701
- total_loss = (start_loss + end_loss) / 2
1702
-
1703
- if not return_dict:
1704
- output = (start_logits, end_logits) + outputs[2:]
1705
- return ((total_loss,) + output) if total_loss is not None else output
1706
-
1707
- return QuestionAnsweringModelOutput(
1708
- loss=total_loss,
1709
- start_logits=start_logits,
1710
- end_logits=end_logits,
1711
- hidden_states=outputs.hidden_states,
1712
- attentions=outputs.attentions,
1713
- )
1714
-
1715
-
1716
- # Copied from transformers.models.llama.modeling_llama.LlamaForTokenClassification with Llama->InternLM2
1717
- @add_start_docstrings(
1718
- """
1719
- The InternLM2 Model transformer with a token classification head on top (a linear layer on top of the hidden-states
1720
- output) e.g. for Named-Entity-Recognition (NER) tasks.
1721
- """,
1722
- InternLM2_START_DOCSTRING,
1723
- )
1724
- class InternLM2ForTokenClassification(InternLM2PreTrainedModel):
1725
- """Token classification model for InternLM2."""
1726
-
1727
- def __init__(self, config):
1728
- super().__init__(config)
1729
- self.num_labels = config.num_labels
1730
- self.model = InternLM2Model(config)
1731
- if getattr(config, "classifier_dropout", None) is not None:
1732
- classifier_dropout = config.classifier_dropout
1733
- elif getattr(config, "hidden_dropout", None) is not None:
1734
- classifier_dropout = config.hidden_dropout
1735
- else:
1736
- classifier_dropout = 0.1
1737
- self.dropout = nn.Dropout(classifier_dropout)
1738
- self.score = nn.Linear(config.hidden_size, config.num_labels)
1739
-
1740
- # Initialize weights and apply final processing
1741
- self.post_init()
1742
-
1743
- def get_input_embeddings(self):
1744
- return self.model.tok_embeddings
1745
-
1746
- def set_input_embeddings(self, value):
1747
- self.model.tok_embeddings = value
1748
-
1749
- @add_start_docstrings_to_model_forward(InternLM2_INPUTS_DOCSTRING)
1750
- def forward(
1751
- self,
1752
- input_ids: torch.LongTensor = None,
1753
- attention_mask: Optional[torch.Tensor] = None,
1754
- position_ids: Optional[torch.LongTensor] = None,
1755
- past_key_values: Optional[List[torch.FloatTensor]] = None,
1756
- inputs_embeds: Optional[torch.FloatTensor] = None,
1757
- labels: Optional[torch.LongTensor] = None,
1758
- use_cache: Optional[bool] = None,
1759
- output_attentions: Optional[bool] = None,
1760
- output_hidden_states: Optional[bool] = None,
1761
- return_dict: Optional[bool] = None,
1762
- ) -> Union[Tuple, TokenClassifierOutput]:
1763
- r"""
1764
- labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1765
- Labels for computing the token classification loss. Indices should either be in `[0, ...,
1766
- config.num_labels - 1]` or -100. Tokens with indices set to `-100` are ignored (masked); the loss is
1767
- only computed for tokens with labels in `[0, ..., config.num_labels - 1]`.
1768
- """
1769
- return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1770
-
1771
- outputs = self.model(
1772
- input_ids,
1773
- attention_mask=attention_mask,
1774
- position_ids=position_ids,
1775
- past_key_values=past_key_values,
1776
- inputs_embeds=inputs_embeds,
1777
- use_cache=use_cache,
1778
- output_attentions=output_attentions,
1779
- output_hidden_states=output_hidden_states,
1780
- return_dict=return_dict,
1781
- )
1782
- sequence_output = outputs[0]
1783
- sequence_output = self.dropout(sequence_output)
1784
- logits = self.score(sequence_output)
1785
-
1786
- loss = None
1787
- if labels is not None:
1788
- loss_fct = CrossEntropyLoss()
1789
- loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
1790
-
1791
- if not return_dict:
1792
- output = (logits,) + outputs[2:]
1793
- return ((loss,) + output) if loss is not None else output
1794
-
1795
- return TokenClassifierOutput(
1796
- loss=loss,
1797
- logits=logits,
1798
- hidden_states=outputs.hidden_states,
1799
- attentions=outputs.attentions,
1800
- )
 
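Aside (not part of this commit): the question-answering forward pass deleted above clamps out-of-range answer positions to the sequence length, excludes them through `ignore_index`, and averages the start and end cross-entropy losses. A minimal self-contained sketch of that arithmetic, using made-up toy tensors rather than real model outputs:

import torch
from torch.nn import CrossEntropyLoss

# Toy extractive-QA scores: one start score and one end score per token.
batch_size, seq_len = 2, 8
start_logits = torch.randn(batch_size, seq_len)
end_logits = torch.randn(batch_size, seq_len)
start_positions = torch.tensor([1, 3])
end_positions = torch.tensor([4, 6])

# Positions outside the sequence are clamped to seq_len and then ignored,
# mirroring the deleted forward pass; the two losses are averaged.
ignored_index = start_logits.size(1)
start_positions = start_positions.clamp(0, ignored_index)
end_positions = end_positions.clamp(0, ignored_index)
loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
total_loss = (loss_fct(start_logits, start_positions) + loss_fct(end_logits, end_positions)) / 2
print(total_loss.item())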
 
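Likewise (again not from the commit), `InternLM2ForTokenClassification.forward` turns every token into one classification example by flattening logits and labels before the cross-entropy call. A toy sketch of that flattening step:

import torch
from torch.nn import CrossEntropyLoss

# Toy shapes: 2 sequences of 5 tokens, 3 possible entity labels.
batch_size, seq_len, num_labels = 2, 5, 3
logits = torch.randn(batch_size, seq_len, num_labels)       # what self.score(...) would return
labels = torch.randint(0, num_labels, (batch_size, seq_len))

# Flatten so each token is one example; labels of -100 (if any) are ignored by default.
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))
print(loss.item())
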
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/tokenization_internlm2.py DELETED
@@ -1,236 +0,0 @@
1
- # coding=utf-8
2
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on transformers/src/transformers/models/llama/tokenization_llama.py
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Tokenization classes for InternLM."""
19
- import os
20
- from shutil import copyfile
21
- from typing import Any, Dict, List, Optional, Tuple
22
-
23
- import sentencepiece as spm
24
- from transformers.tokenization_utils import PreTrainedTokenizer
25
- from transformers.utils import logging
26
-
27
- logger = logging.get_logger(__name__)
28
-
29
- VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
30
-
31
- PRETRAINED_VOCAB_FILES_MAP = {}
32
-
33
-
34
- # Modified from transformers.model.llama.tokenization_llama.LlamaTokenizer
35
- class InternLM2Tokenizer(PreTrainedTokenizer):
36
- """
37
- Construct an InternLM2 tokenizer. Based on byte-level Byte-Pair-Encoding.
38
-
39
- Args:
40
- vocab_file (`str`):
41
- Path to the vocabulary file.
42
- """
43
-
44
- vocab_files_names = VOCAB_FILES_NAMES
45
- pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
46
- model_input_names = ["input_ids", "attention_mask"]
47
- _auto_class = "AutoTokenizer"
48
-
49
- def __init__(
50
- self,
51
- vocab_file,
52
- unk_token="<unk>",
53
- bos_token="<s>",
54
- eos_token="</s>",
55
- pad_token="</s>",
56
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
57
- add_bos_token=True,
58
- add_eos_token=False,
59
- decode_with_prefix_space=False,
60
- clean_up_tokenization_spaces=False,
61
- **kwargs,
62
- ):
63
- self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
64
- self.vocab_file = vocab_file
65
- self.add_bos_token = add_bos_token
66
- self.add_eos_token = add_eos_token
67
- self.decode_with_prefix_space = decode_with_prefix_space
68
- self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
69
- self.sp_model.Load(vocab_file)
70
- self._no_prefix_space_tokens = None
71
- super().__init__(
72
- bos_token=bos_token,
73
- eos_token=eos_token,
74
- unk_token=unk_token,
75
- pad_token=pad_token,
76
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
77
- **kwargs,
78
- )
79
-
80
- @property
81
- def no_prefix_space_tokens(self):
82
- if self._no_prefix_space_tokens is None:
83
- vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
84
- self._no_prefix_space_tokens = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
85
- return self._no_prefix_space_tokens
86
-
87
- @property
88
- def vocab_size(self):
89
- """Returns vocab size"""
90
- return self.sp_model.get_piece_size()
91
-
92
- @property
93
- def bos_token_id(self) -> Optional[int]:
94
- return self.sp_model.bos_id()
95
-
96
- @property
97
- def eos_token_id(self) -> Optional[int]:
98
- return self.sp_model.eos_id()
99
-
100
- def get_vocab(self):
101
- """Returns vocab as a dict"""
102
- vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
103
- vocab.update(self.added_tokens_encoder)
104
- return vocab
105
-
106
- def _tokenize(self, text):
107
- """Returns a tokenized string."""
108
- return self.sp_model.encode(text, out_type=str)
109
-
110
- def _convert_token_to_id(self, token):
111
- """Converts a token (str) into an id using the vocab."""
112
- return self.sp_model.piece_to_id(token)
113
-
114
- def _convert_id_to_token(self, index):
115
- """Converts an index (integer) into a token (str) using the vocab."""
116
- token = self.sp_model.IdToPiece(index)
117
- return token
118
-
119
- def _maybe_add_prefix_space(self, tokens, decoded):
120
- if tokens and tokens[0] not in self.no_prefix_space_tokens:
121
- return " " + decoded
122
- else:
123
- return decoded
124
-
125
- def convert_tokens_to_string(self, tokens):
126
- """Converts a sequence of tokens (string) into a single string."""
127
- current_sub_tokens = []
128
- out_string = ""
129
- prev_is_special = False
130
- for token in tokens:
131
- # make sure that special tokens are not decoded using sentencepiece model
132
- if token in self.all_special_tokens:
133
- if not prev_is_special:
134
- out_string += " "
135
- out_string += self.sp_model.decode(current_sub_tokens) + token
136
- prev_is_special = True
137
- current_sub_tokens = []
138
- else:
139
- current_sub_tokens.append(token)
140
- prev_is_special = False
141
- out_string += self.sp_model.decode(current_sub_tokens)
142
- out_string = self.clean_up_tokenization(out_string)
143
- out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
144
- return out_string[1:]
145
-
146
- def save_vocabulary(self, save_directory, filename_prefix: Optional[str] = None) -> Tuple[str]:
147
- """
148
- Save the vocabulary and special tokens file to a directory.
149
-
150
- Args:
151
- save_directory (`str`):
152
- The directory in which to save the vocabulary.
153
-
154
- Returns:
155
- `Tuple(str)`: Paths to the files saved.
156
- """
157
- if not os.path.isdir(save_directory):
158
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
159
- return
160
- out_vocab_file = os.path.join(
161
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
162
- )
163
-
164
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
165
- copyfile(self.vocab_file, out_vocab_file)
166
- elif not os.path.isfile(self.vocab_file):
167
- with open(out_vocab_file, "wb") as fi:
168
- content_spiece_model = self.sp_model.serialized_model_proto()
169
- fi.write(content_spiece_model)
170
-
171
- return (out_vocab_file,)
172
-
173
- def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
174
- if self.add_bos_token:
175
- bos_token_ids = [self.bos_token_id]
176
- else:
177
- bos_token_ids = []
178
-
179
- output = bos_token_ids + token_ids_0
180
-
181
- if token_ids_1 is not None:
182
- output = output + token_ids_1
183
-
184
- if self.add_eos_token:
185
- output = output + [self.eos_token_id]
186
-
187
- return output
188
-
189
- def get_special_tokens_mask(
190
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
191
- ) -> List[int]:
192
- """
193
- Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
194
- special tokens using the tokenizer `prepare_for_model` method.
195
-
196
- Args:
197
- token_ids_0 (`List[int]`):
198
- List of IDs.
199
- token_ids_1 (`List[int]`, *optional*):
200
- Optional second list of IDs for sequence pairs.
201
- already_has_special_tokens (`bool`, *optional*, defaults to `False`):
202
- Whether or not the token list is already formatted with special tokens for the model.
203
-
204
- Returns:
205
- `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
206
- """
207
- if already_has_special_tokens:
208
- return super().get_special_tokens_mask(
209
- token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
210
- )
211
-
212
- if token_ids_1 is None:
213
- return [1] + ([0] * len(token_ids_0)) + [1]
214
- return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
215
-
216
- def create_token_type_ids_from_sequences(
217
- self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
218
- ) -> List[int]:
219
- """
220
- Create a mask from the two sequences passed to be used in a sequence-pair classification task. InternLM2 does not make
221
- use of token type ids, therefore a list of zeros is returned.
222
-
223
- Args:
224
- token_ids_0 (`List[int]`):
225
- List of IDs.
226
- token_ids_1 (`List[int]`, *optional*):
227
- Optional second list of IDs for sequence pairs.
228
-
229
- Returns:
230
- `List[int]`: List of zeros.
231
- """
232
- eos = [self.eos_token_id]
233
-
234
- if token_ids_1 is None:
235
- return len(token_ids_0 + eos) * [0]
236
- return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
 
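For reference only (not part of this commit), `build_inputs_with_special_tokens` above prepends BOS once, concatenates the optional second sequence as-is, and appends EOS only when `add_eos_token` is set. A standalone sketch of that logic, assuming ids 1 and 2 for `<s>` and `</s>` as listed in the tokenizer config later in this diff:

from typing import List, Optional

def build_inputs(token_ids_0: List[int],
                 token_ids_1: Optional[List[int]] = None,
                 bos_id: int = 1, eos_id: int = 2,
                 add_bos: bool = True, add_eos: bool = False) -> List[int]:
    # BOS is prepended once; the second sequence (if any) is appended raw;
    # EOS is appended only on request -- same behaviour as the deleted method.
    ids = ([bos_id] if add_bos else []) + list(token_ids_0)
    if token_ids_1 is not None:
        ids += list(token_ids_1)
    if add_eos:
        ids += [eos_id]
    return ids

print(build_inputs([10, 11, 12]))               # [1, 10, 11, 12]
print(build_inputs([10], [20], add_eos=True))   # [1, 10, 20, 2]
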
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/tokenization_internlm2_fast.py DELETED
@@ -1,214 +0,0 @@
1
- # coding=utf-8
2
- # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
3
- #
4
- # This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
5
- #
6
- # Licensed under the Apache License, Version 2.0 (the "License");
7
- # you may not use this file except in compliance with the License.
8
- # You may obtain a copy of the License at
9
- #
10
- # http://www.apache.org/licenses/LICENSE-2.0
11
- #
12
- # Unless required by applicable law or agreed to in writing, software
13
- # distributed under the License is distributed on an "AS IS" BASIS,
14
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
- # See the License for the specific language governing permissions and
16
- # limitations under the License.
17
-
18
- """Fast tokenization class for InternLM."""
19
- import os
20
- from shutil import copyfile
21
- from typing import Any, Dict, Optional, Tuple
22
-
23
- from tokenizers import processors, decoders, Tokenizer, normalizers
24
- from tokenizers.models import BPE
25
-
26
- from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
27
- from transformers.utils import logging
28
-
29
- from transformers.convert_slow_tokenizer import (
30
- SLOW_TO_FAST_CONVERTERS,
31
- SpmConverter,
32
- SentencePieceExtractor,
33
- )
34
-
35
- from .tokenization_internlm2 import InternLM2Tokenizer
36
-
37
- logger = logging.get_logger(__name__)
38
-
39
- VOCAB_FILES_NAMES = {"vocab_file": "./tokenizer.model"}
40
-
41
- # Modified from transformers.convert_slow_tokenizer.LlamaConverter
42
- class InternLM2Converter(SpmConverter):
43
- handle_byte_fallback = True
44
-
45
- def vocab(self, proto):
46
- vocab = [
47
- ("<unk>", 0.0),
48
- ("<s>", 0.0),
49
- ("</s>", 0.0),
50
- ]
51
- vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
52
- return vocab
53
-
54
- def unk_id(self, proto):
55
- unk_id = 0
56
- return unk_id
57
-
58
- def decoder(self, replacement, add_prefix_space):
59
- decoders_sequence = [
60
- decoders.Replace("▁", " "),
61
- decoders.ByteFallback(),
62
- decoders.Fuse(),
63
- ]
64
- if self.proto.normalizer_spec.add_dummy_prefix:
65
- decoders_sequence.append(decoders.Strip(content=" ", left=1))
66
- return decoders.Sequence(decoders_sequence)
67
-
68
- def tokenizer(self, proto):
69
- model_type = proto.trainer_spec.model_type
70
- vocab_scores = self.vocab(proto)
71
- # special tokens
72
- added_tokens = self.original_tokenizer.added_tokens_decoder
73
- for i in range(len(vocab_scores)):
74
- piece, score = vocab_scores[i]
75
- if i in added_tokens:
76
- vocab_scores[i] = (added_tokens[i].content, score)
77
- if model_type == 1:
78
- raise RuntimeError("InternLM2 is supposed to be a BPE model!")
79
-
80
- elif model_type == 2:
81
- _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
82
- bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
83
- tokenizer = Tokenizer(
84
- BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
85
- )
86
- tokenizer.add_special_tokens(
87
- [ added_token for index, added_token in added_tokens.items()]
88
- )
89
- else:
90
- raise Exception(
91
- "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
92
- )
93
-
94
- return tokenizer
95
-
96
- def normalizer(self, proto):
97
- normalizers_list = []
98
- if proto.normalizer_spec.add_dummy_prefix:
99
- normalizers_list.append(normalizers.Prepend(prepend="▁"))
100
- normalizers_list.append(normalizers.Replace(pattern=" ", content="▁"))
101
- return normalizers.Sequence(normalizers_list)
102
-
103
- def pre_tokenizer(self, replacement, add_prefix_space):
104
- return None
105
-
106
- SLOW_TO_FAST_CONVERTERS["InternLM2Tokenizer"] = InternLM2Converter
107
-
108
-
109
- # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
110
- class InternLM2TokenizerFast(PreTrainedTokenizerFast):
111
- vocab_files_names = VOCAB_FILES_NAMES
112
- slow_tokenizer_class = InternLM2Tokenizer
113
- padding_side = "left"
114
- model_input_names = ["input_ids", "attention_mask"]
115
- _auto_class = "AutoTokenizer"
116
-
117
- def __init__(
118
- self,
119
- vocab_file,
120
- unk_token="<unk>",
121
- bos_token="<s>",
122
- eos_token="</s>",
123
- pad_token="</s>",
124
- sp_model_kwargs: Optional[Dict[str, Any]] = None,
125
- add_bos_token=True,
126
- add_eos_token=False,
127
- decode_with_prefix_space=False,
128
- clean_up_tokenization_spaces=False,
129
- **kwargs,
130
- ):
131
- super().__init__(
132
- vocab_file=vocab_file,
133
- unk_token=unk_token,
134
- bos_token=bos_token,
135
- eos_token=eos_token,
136
- pad_token=pad_token,
137
- sp_model_kwargs=sp_model_kwargs,
138
- add_bos_token=add_bos_token,
139
- add_eos_token=add_eos_token,
140
- decode_with_prefix_space=decode_with_prefix_space,
141
- clean_up_tokenization_spaces=clean_up_tokenization_spaces,
142
- **kwargs,
143
- )
144
- self._add_bos_token = add_bos_token
145
- self._add_eos_token = add_eos_token
146
- self.update_post_processor()
147
- self.vocab_file = vocab_file
148
-
149
- @property
150
- def can_save_slow_tokenizer(self) -> bool:
151
- return os.path.isfile(self.vocab_file) if self.vocab_file else False
152
-
153
- def update_post_processor(self):
154
- """
155
- Updates the underlying post processor with the current `bos_token` and `eos_token`.
156
- """
157
- bos = self.bos_token
158
- bos_token_id = self.bos_token_id
159
- if bos is None and self.add_bos_token:
160
- raise ValueError("add_bos_token = True but bos_token = None")
161
-
162
- eos = self.eos_token
163
- eos_token_id = self.eos_token_id
164
- if eos is None and self.add_eos_token:
165
- raise ValueError("add_eos_token = True but eos_token = None")
166
-
167
- single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
168
- pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"
169
-
170
- special_tokens = []
171
- if self.add_bos_token:
172
- special_tokens.append((bos, bos_token_id))
173
- if self.add_eos_token:
174
- special_tokens.append((eos, eos_token_id))
175
- self._tokenizer.post_processor = processors.TemplateProcessing(
176
- single=single, pair=pair, special_tokens=special_tokens
177
- )
178
-
179
- @property
180
- def add_eos_token(self):
181
- return self._add_eos_token
182
-
183
- @property
184
- def add_bos_token(self):
185
- return self._add_bos_token
186
-
187
- @add_eos_token.setter
188
- def add_eos_token(self, value):
189
- self._add_eos_token = value
190
- self.update_post_processor()
191
-
192
- @add_bos_token.setter
193
- def add_bos_token(self, value):
194
- self._add_bos_token = value
195
- self.update_post_processor()
196
-
197
- def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
198
- if not self.can_save_slow_tokenizer:
199
- raise ValueError(
200
- "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
201
- "tokenizer."
202
- )
203
-
204
- if not os.path.isdir(save_directory):
205
- logger.error(f"Vocabulary path ({save_directory}) should be a directory")
206
- return
207
- out_vocab_file = os.path.join(
208
- save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
209
- )
210
-
211
- if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
212
- copyfile(self.vocab_file, out_vocab_file)
213
-
214
- return (out_vocab_file,)
 
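As a side note (not part of this commit), `update_post_processor` above rebuilds a `tokenizers` `TemplateProcessing` post-processor from the current BOS/EOS settings. With the defaults `add_bos_token=True` and `add_eos_token=False`, the template it produces can be reconstructed roughly as follows (the id 1 for `<s>` is an assumption taken from the tokenizer config shown later in this diff):

from tokenizers import processors

bos, bos_id = "<s>", 1                 # assumed special token and id
single = f"{bos}:0 $A:0"               # BOS + first sequence, all with type id 0
pair = f"{single} {bos}:1 $B:1"        # second sequence gets type id 1

post_processor = processors.TemplateProcessing(
    single=single,
    pair=pair,
    special_tokens=[(bos, bos_id)],
)
print(single)  # <s>:0 $A:0
print(pair)    # <s>:0 $A:0 <s>:1 $B:1
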
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/tokenizer.json DELETED
The diff for this file is too large to render. See raw diff
 
llama-factory/merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/tokenizer_config.json DELETED
@@ -1,1640 +0,0 @@
1
- {
2
- "add_bos_token": true,
3
- "add_eos_token": false,
4
- "added_tokens_decoder": {
5
- "0": {
6
- "content": "<unk>",
7
- "lstrip": false,
8
- "normalized": false,
9
- "rstrip": false,
10
- "single_word": false,
11
- "special": true
12
- },
13
- "1": {
14
- "content": "<s>",
15
- "lstrip": false,
16
- "normalized": false,
17
- "rstrip": false,
18
- "single_word": false,
19
- "special": true
20
- },
21
- "2": {
22
- "content": "</s>",
23
- "lstrip": false,
24
- "normalized": false,
25
- "rstrip": false,
26
- "single_word": false,
27
- "special": true
28
- },
29
- "92352": {
30
- "content": "E",
31
- "lstrip": false,
32
- "normalized": false,
33
- "rstrip": false,
34
- "single_word": false,
35
- "special": false
36
- },
37
- "92353": {
38
- "content": "F",
39
- "lstrip": false,
40
- "normalized": false,
41
- "rstrip": false,
42
- "single_word": false,
43
- "special": false
44
- },
45
- "92354": {
46
- "content": "G",
47
- "lstrip": false,
48
- "normalized": false,
49
- "rstrip": false,
50
- "single_word": false,
51
- "special": false
52
- },
53
- "92355": {
54
- "content": "H",
55
- "lstrip": false,
56
- "normalized": false,
57
- "rstrip": false,
58
- "single_word": false,
59
- "special": false
60
- },
61
- "92356": {
62
- "content": "I",
63
- "lstrip": false,
64
- "normalized": false,
65
- "rstrip": false,
66
- "single_word": false,
67
- "special": false
68
- },
69
- "92357": {
70
- "content": "J",
71
- "lstrip": false,
72
- "normalized": false,
73
- "rstrip": false,
74
- "single_word": false,
75
- "special": false
76
- },
77
- "92358": {
78
- "content": "K",
79
- "lstrip": false,
80
- "normalized": false,
81
- "rstrip": false,
82
- "single_word": false,
83
- "special": false
84
- },
85
- "92359": {
86
- "content": "L",
87
- "lstrip": false,
88
- "normalized": false,
89
- "rstrip": false,
90
- "single_word": false,
91
- "special": false
92
- },
93
- "92360": {
94
- "content": "M",
95
- "lstrip": false,
96
- "normalized": false,
97
- "rstrip": false,
98
- "single_word": false,
99
- "special": false
100
- },
101
- "92361": {
102
- "content": "N",
103
- "lstrip": false,
104
- "normalized": false,
105
- "rstrip": false,
106
- "single_word": false,
107
- "special": false
108
- },
109
- "92362": {
110
- "content": "R",
111
- "lstrip": false,
112
- "normalized": false,
113
- "rstrip": false,
114
- "single_word": false,
115
- "special": false
116
- },
117
- "92363": {
118
- "content": "U",
119
- "lstrip": false,
120
- "normalized": false,
121
- "rstrip": false,
122
- "single_word": false,
123
- "special": false
124
- },
125
- "92364": {
126
- "content": "V",
127
- "lstrip": false,
128
- "normalized": false,
129
- "rstrip": false,
130
- "single_word": false,
131
- "special": false
132
- },
133
- "92365": {
134
- "content": "W",
135
- "lstrip": false,
136
- "normalized": false,
137
- "rstrip": false,
138
- "single_word": false,
139
- "special": false
140
- },
141
- "92366": {
142
- "content": "X",
143
- "lstrip": false,
144
- "normalized": false,
145
- "rstrip": false,
146
- "single_word": false,
147
- "special": false
148
- },
149
- "92367": {
150
- "content": "Y",
151
- "lstrip": false,
152
- "normalized": false,
153
- "rstrip": false,
154
- "single_word": false,
155
- "special": false
156
- },
157
- "92368": {
158
- "content": "Z",
159
- "lstrip": false,
160
- "normalized": false,
161
- "rstrip": false,
162
- "single_word": false,
163
- "special": false
164
- },
165
- "92369": {
166
- "content": "a",
167
- "lstrip": false,
168
- "normalized": false,
169
- "rstrip": false,
170
- "single_word": false,
171
- "special": false
172
- },
173
- "92370": {
174
- "content": "b",
175
- "lstrip": false,
176
- "normalized": false,
177
- "rstrip": false,
178
- "single_word": false,
179
- "special": false
180
- },
181
- "92371": {
182
- "content": "c",
183
- "lstrip": false,
184
- "normalized": false,
185
- "rstrip": false,
186
- "single_word": false,
187
- "special": false
188
- },
189
- "92372": {
190
- "content": "d",
191
- "lstrip": false,
192
- "normalized": false,
193
- "rstrip": false,
194
- "single_word": false,
195
- "special": false
196
- },
197
- "92373": {
198
- "content": "e",
199
- "lstrip": false,
200
- "normalized": false,
201
- "rstrip": false,
202
- "single_word": false,
203
- "special": false
204
- },
205
- "92374": {
206
- "content": "f",
207
- "lstrip": false,
208
- "normalized": false,
209
- "rstrip": false,
210
- "single_word": false,
211
- "special": false
212
- },
213
- "92375": {
214
- "content": "g",
215
- "lstrip": false,
216
- "normalized": false,
217
- "rstrip": false,
218
- "single_word": false,
219
- "special": false
220
- },
221
- "92376": {
222
- "content": "h",
223
- "lstrip": false,
224
- "normalized": false,
225
- "rstrip": false,
226
- "single_word": false,
227
- "special": false
228
- },
229
- "92377": {
230
- "content": "i",
231
- "lstrip": false,
232
- "normalized": false,
233
- "rstrip": false,
234
- "single_word": false,
235
- "special": false
236
- },
237
- "92378": {
238
- "content": "j",
239
- "lstrip": false,
240
- "normalized": false,
241
- "rstrip": false,
242
- "single_word": false,
243
- "special": false
244
- },
245
- "92379": {
246
- "content": "k",
247
- "lstrip": false,
248
- "normalized": false,
249
- "rstrip": false,
250
- "single_word": false,
251
- "special": false
252
- },
253
- "92380": {
254
- "content": "l",
255
- "lstrip": false,
256
- "normalized": false,
257
- "rstrip": false,
258
- "single_word": false,
259
- "special": false
260
- },
261
- "92381": {
262
- "content": "m",
263
- "lstrip": false,
264
- "normalized": false,
265
- "rstrip": false,
266
- "single_word": false,
267
- "special": false
268
- },
269
- "92382": {
270
- "content": "n",
271
- "lstrip": false,
272
- "normalized": false,
273
- "rstrip": false,
274
- "single_word": false,
275
- "special": false
276
- },
277
- "92383": {
278
- "content": "o",
279
- "lstrip": false,
280
- "normalized": false,
281
- "rstrip": false,
282
- "single_word": false,
283
- "special": false
284
- },
285
- "92384": {
286
- "content": "p",
287
- "lstrip": false,
288
- "normalized": false,
289
- "rstrip": false,
290
- "single_word": false,
291
- "special": false
292
- },
293
- "92385": {
294
- "content": "q",
295
- "lstrip": false,
296
- "normalized": false,
297
- "rstrip": false,
298
- "single_word": false,
299
- "special": false
300
- },
301
- "92386": {
302
- "content": "r",
303
- "lstrip": false,
304
- "normalized": false,
305
- "rstrip": false,
306
- "single_word": false,
307
- "special": false
308
- },
309
- "92387": {
310
- "content": "s",
311
- "lstrip": false,
312
- "normalized": false,
313
- "rstrip": false,
314
- "single_word": false,
315
- "special": false
316
- },
317
- "92388": {
318
- "content": "t",
319
- "lstrip": false,
320
- "normalized": false,
321
- "rstrip": false,
322
- "single_word": false,
323
- "special": false
324
- },
325
- "92389": {
326
- "content": "u",
327
- "lstrip": false,
328
- "normalized": false,
329
- "rstrip": false,
330
- "single_word": false,
331
- "special": false
332
- },
333
- "92390": {
334
- "content": "v",
335
- "lstrip": false,
336
- "normalized": false,
337
- "rstrip": false,
338
- "single_word": false,
339
- "special": false
340
- },
341
- "92391": {
342
- "content": "w",
343
- "lstrip": false,
344
- "normalized": false,
345
- "rstrip": false,
346
- "single_word": false,
347
- "special": false
348
- },
349
- "92392": {
350
- "content": "x",
351
- "lstrip": false,
352
- "normalized": false,
353
- "rstrip": false,
354
- "single_word": false,
355
- "special": false
356
- },
357
- "92393": {
358
- "content": "y",
359
- "lstrip": false,
360
- "normalized": false,
361
- "rstrip": false,
362
- "single_word": false,
363
- "special": false
364
- },
365
- "92394": {
366
- "content": "z",
367
- "lstrip": false,
368
- "normalized": false,
369
- "rstrip": false,
370
- "single_word": false,
371
- "special": false
372
- },
373
- "92395": {
374
- "content": "——",
375
- "lstrip": false,
376
- "normalized": false,
377
- "rstrip": false,
378
- "single_word": false,
379
- "special": false
380
- },
381
- "92396": {
382
- "content": "……",
383
- "lstrip": false,
384
- "normalized": false,
385
- "rstrip": false,
386
- "single_word": false,
387
- "special": false
388
- },
389
- "92397": {
390
- "content": "[UNUSED_TOKEN_0]",
391
- "lstrip": false,
392
- "normalized": false,
393
- "rstrip": false,
394
- "single_word": false,
395
- "special": false
396
- },
397
- "92398": {
398
- "content": "[UNUSED_TOKEN_1]",
399
- "lstrip": false,
400
- "normalized": false,
401
- "rstrip": false,
402
- "single_word": false,
403
- "special": false
404
- },
405
- "92399": {
406
- "content": "[UNUSED_TOKEN_2]",
407
- "lstrip": false,
408
- "normalized": false,
409
- "rstrip": false,
410
- "single_word": false,
411
- "special": false
412
- },
413
- "92400": {
414
- "content": "[UNUSED_TOKEN_3]",
415
- "lstrip": false,
416
- "normalized": false,
417
- "rstrip": false,
418
- "single_word": false,
419
- "special": false
420
- },
421
- "92401": {
422
- "content": "[UNUSED_TOKEN_4]",
423
- "lstrip": false,
424
- "normalized": false,
425
- "rstrip": false,
426
- "single_word": false,
427
- "special": false
428
- },
429
- "92402": {
430
- "content": "[UNUSED_TOKEN_5]",
431
- "lstrip": false,
432
- "normalized": false,
433
- "rstrip": false,
434
- "single_word": false,
435
- "special": false
436
- },
437
- "92403": {
438
- "content": "[UNUSED_TOKEN_6]",
439
- "lstrip": false,
440
- "normalized": false,
441
- "rstrip": false,
442
- "single_word": false,
443
- "special": false
444
- },
445
- "92404": {
446
- "content": "[UNUSED_TOKEN_7]",
447
- "lstrip": false,
448
- "normalized": false,
449
- "rstrip": false,
450
- "single_word": false,
451
- "special": false
452
- },
453
- "92405": {
454
- "content": "[UNUSED_TOKEN_8]",
455
- "lstrip": false,
456
- "normalized": false,
457
- "rstrip": false,
458
- "single_word": false,
459
- "special": false
460
- },
461
- "92406": {
462
- "content": "[UNUSED_TOKEN_9]",
463
- "lstrip": false,
464
- "normalized": false,
465
- "rstrip": false,
466
- "single_word": false,
467
- "special": false
468
- },
469
- "92407": {
470
- "content": "[UNUSED_TOKEN_10]",
471
- "lstrip": false,
472
- "normalized": false,
473
- "rstrip": false,
474
- "single_word": false,
475
- "special": false
476
- },
477
- "92408": {
478
- "content": "[UNUSED_TOKEN_11]",
479
- "lstrip": false,
480
- "normalized": false,
481
- "rstrip": false,
482
- "single_word": false,
483
- "special": false
484
- },
485
- "92409": {
486
- "content": "[UNUSED_TOKEN_12]",
487
- "lstrip": false,
488
- "normalized": false,
489
- "rstrip": false,
490
- "single_word": false,
491
- "special": false
492
- },
493
- "92410": {
494
- "content": "[UNUSED_TOKEN_13]",
495
- "lstrip": false,
496
- "normalized": false,
497
- "rstrip": false,
498
- "single_word": false,
499
- "special": false
500
- },
501
- "92411": {
502
- "content": "[UNUSED_TOKEN_14]",
503
- "lstrip": false,
504
- "normalized": false,
505
- "rstrip": false,
506
- "single_word": false,
507
- "special": false
508
- },
509
- "92412": {
510
- "content": "[UNUSED_TOKEN_15]",
511
- "lstrip": false,
512
- "normalized": false,
513
- "rstrip": false,
514
- "single_word": false,
515
- "special": false
516
- },
517
- "92413": {
518
- "content": "[UNUSED_TOKEN_16]",
519
- "lstrip": false,
520
- "normalized": false,
521
- "rstrip": false,
522
- "single_word": false,
523
- "special": false
524
- },
525
- "92414": {
526
- "content": "[UNUSED_TOKEN_17]",
527
- "lstrip": false,
528
- "normalized": false,
529
- "rstrip": false,
530
- "single_word": false,
531
- "special": false
532
- },
533
- "92415": {
534
- "content": "[UNUSED_TOKEN_18]",
535
- "lstrip": false,
536
- "normalized": false,
537
- "rstrip": false,
538
- "single_word": false,
539
- "special": false
540
- },
541
- "92416": {
542
- "content": "[UNUSED_TOKEN_19]",
543
- "lstrip": false,
544
- "normalized": false,
545
- "rstrip": false,
546
- "single_word": false,
547
- "special": false
548
- },
549
- "92417": {
550
- "content": "[UNUSED_TOKEN_20]",
551
- "lstrip": false,
552
- "normalized": false,
553
- "rstrip": false,
554
- "single_word": false,
555
- "special": false
556
- },
557
- "92418": {
558
- "content": "[UNUSED_TOKEN_21]",
559
- "lstrip": false,
560
- "normalized": false,
561
- "rstrip": false,
562
- "single_word": false,
563
- "special": false
564
- },
565
- "92419": {
566
- "content": "[UNUSED_TOKEN_22]",
567
- "lstrip": false,
568
- "normalized": false,
569
- "rstrip": false,
570
- "single_word": false,
571
- "special": false
572
- },
573
- "92420": {
574
- "content": "[UNUSED_TOKEN_23]",
575
- "lstrip": false,
576
- "normalized": false,
577
- "rstrip": false,
578
- "single_word": false,
579
- "special": false
580
- },
581
- "92421": {
582
- "content": "[UNUSED_TOKEN_24]",
583
- "lstrip": false,
584
- "normalized": false,
585
- "rstrip": false,
586
- "single_word": false,
587
- "special": false
588
- },
589
- "92422": {
590
- "content": "[UNUSED_TOKEN_25]",
591
- "lstrip": false,
592
- "normalized": false,
593
- "rstrip": false,
594
- "single_word": false,
595
- "special": false
596
- },
597
- "92423": {
598
- "content": "[UNUSED_TOKEN_26]",
599
- "lstrip": false,
600
- "normalized": false,
601
- "rstrip": false,
602
- "single_word": false,
603
- "special": false
604
- },
605
- "92424": {
606
- "content": "[UNUSED_TOKEN_27]",
607
- "lstrip": false,
608
- "normalized": false,
609
- "rstrip": false,
610
- "single_word": false,
611
- "special": false
612
- },
613
- "92425": {
614
- "content": "[UNUSED_TOKEN_28]",
615
- "lstrip": false,
616
- "normalized": false,
617
- "rstrip": false,
618
- "single_word": false,
619
- "special": false
620
- },
621
- "92426": {
622
- "content": "[UNUSED_TOKEN_29]",
623
- "lstrip": false,
624
- "normalized": false,
625
- "rstrip": false,
626
- "single_word": false,
627
- "special": false
628
- },
629
- "92427": {
630
- "content": "[UNUSED_TOKEN_30]",
631
- "lstrip": false,
632
- "normalized": false,
633
- "rstrip": false,
634
- "single_word": false,
635
- "special": false
636
- },
637
- "92428": {
638
- "content": "[UNUSED_TOKEN_31]",
639
- "lstrip": false,
640
- "normalized": false,
641
- "rstrip": false,
642
- "single_word": false,
643
- "special": false
644
- },
645
- "92429": {
646
- "content": "[UNUSED_TOKEN_32]",
647
- "lstrip": false,
648
- "normalized": false,
649
- "rstrip": false,
650
- "single_word": false,
651
- "special": false
652
- },
653
- "92430": {
654
- "content": "[UNUSED_TOKEN_33]",
655
- "lstrip": false,
656
- "normalized": false,
657
- "rstrip": false,
658
- "single_word": false,
659
- "special": false
660
- },
661
- "92431": {
662
- "content": "[UNUSED_TOKEN_34]",
663
- "lstrip": false,
664
- "normalized": false,
665
- "rstrip": false,
666
- "single_word": false,
667
- "special": false
668
- },
669
- "92432": {
670
- "content": "[UNUSED_TOKEN_35]",
671
- "lstrip": false,
672
- "normalized": false,
673
- "rstrip": false,
674
- "single_word": false,
675
- "special": false
676
- },
677
- "92433": {
678
- "content": "[UNUSED_TOKEN_36]",
679
- "lstrip": false,
680
- "normalized": false,
681
- "rstrip": false,
682
- "single_word": false,
683
- "special": false
684
- },
685
- "92434": {
686
- "content": "[UNUSED_TOKEN_37]",
687
- "lstrip": false,
688
- "normalized": false,
689
- "rstrip": false,
690
- "single_word": false,
691
- "special": false
692
- },
693
- "92435": {
694
- "content": "[UNUSED_TOKEN_38]",
695
- "lstrip": false,
696
- "normalized": false,
697
- "rstrip": false,
698
- "single_word": false,
699
- "special": false
700
- },
701
- "92436": {
702
- "content": "[UNUSED_TOKEN_39]",
703
- "lstrip": false,
704
- "normalized": false,
705
- "rstrip": false,
706
- "single_word": false,
707
- "special": false
708
- },
709
- "92437": {
710
- "content": "[UNUSED_TOKEN_40]",
711
- "lstrip": false,
712
- "normalized": false,
713
- "rstrip": false,
714
- "single_word": false,
715
- "special": false
716
- },
717
- "92438": {
718
- "content": "[UNUSED_TOKEN_41]",
719
- "lstrip": false,
720
- "normalized": false,
721
- "rstrip": false,
722
- "single_word": false,
723
- "special": false
724
- },
725
- "92439": {
726
- "content": "[UNUSED_TOKEN_42]",
727
- "lstrip": false,
728
- "normalized": false,
729
- "rstrip": false,
730
- "single_word": false,
731
- "special": false
732
- },
733
- "92440": {
734
- "content": "[UNUSED_TOKEN_43]",
735
- "lstrip": false,
736
- "normalized": false,
737
- "rstrip": false,
738
- "single_word": false,
739
- "special": false
740
- },
741
- "92441": {
742
- "content": "[UNUSED_TOKEN_44]",
743
- "lstrip": false,
744
- "normalized": false,
745
- "rstrip": false,
746
- "single_word": false,
747
- "special": false
748
- },
749
- "92442": {
750
- "content": "[UNUSED_TOKEN_45]",
751
- "lstrip": false,
752
- "normalized": false,
753
- "rstrip": false,
754
- "single_word": false,
755
- "special": false
756
- },
757
- "92443": {
758
- "content": "[UNUSED_TOKEN_46]",
759
- "lstrip": false,
760
- "normalized": false,
761
- "rstrip": false,
762
- "single_word": false,
763
- "special": false
764
- },
765
- "92444": {
766
- "content": "[UNUSED_TOKEN_47]",
767
- "lstrip": false,
768
- "normalized": false,
769
- "rstrip": false,
770
- "single_word": false,
771
- "special": false
772
- },
773
- "92445": {
774
- "content": "[UNUSED_TOKEN_48]",
775
- "lstrip": false,
776
- "normalized": false,
777
- "rstrip": false,
778
- "single_word": false,
779
- "special": false
780
- },
781
- "92446": {
782
- "content": "[UNUSED_TOKEN_49]",
783
- "lstrip": false,
784
- "normalized": false,
785
- "rstrip": false,
786
- "single_word": false,
787
- "special": false
788
- },
789
- "92447": {
790
- "content": "[UNUSED_TOKEN_50]",
791
- "lstrip": false,
792
- "normalized": false,
793
- "rstrip": false,
794
- "single_word": false,
795
- "special": false
796
- },
797
- "92448": {
798
- "content": "[UNUSED_TOKEN_51]",
799
- "lstrip": false,
800
- "normalized": false,
801
- "rstrip": false,
802
- "single_word": false,
803
- "special": false
804
- },
805
- "92449": {
806
- "content": "[UNUSED_TOKEN_52]",
807
- "lstrip": false,
808
- "normalized": false,
809
- "rstrip": false,
810
- "single_word": false,
811
- "special": false
812
- },
813
- "92450": {
814
- "content": "[UNUSED_TOKEN_53]",
815
- "lstrip": false,
816
- "normalized": false,
817
- "rstrip": false,
818
- "single_word": false,
819
- "special": false
820
- },
821
- "92451": {
822
- "content": "[UNUSED_TOKEN_54]",
823
- "lstrip": false,
824
- "normalized": false,
825
- "rstrip": false,
826
- "single_word": false,
827
- "special": false
828
- },
829
- "92452": {
830
- "content": "[UNUSED_TOKEN_55]",
831
- "lstrip": false,
832
- "normalized": false,
833
- "rstrip": false,
834
- "single_word": false,
835
- "special": false
836
- },
837
- "92453": {
838
- "content": "[UNUSED_TOKEN_56]",
839
- "lstrip": false,
840
- "normalized": false,
841
- "rstrip": false,
842
- "single_word": false,
843
- "special": false
844
- },
845
- "92454": {
846
- "content": "[UNUSED_TOKEN_57]",
847
- "lstrip": false,
848
- "normalized": false,
849
- "rstrip": false,
850
- "single_word": false,
851
- "special": false
852
- },
853
- "92455": {
854
- "content": "[UNUSED_TOKEN_58]",
855
- "lstrip": false,
856
- "normalized": false,
857
- "rstrip": false,
858
- "single_word": false,
859
- "special": false
860
- },
861
- "92456": {
862
- "content": "[UNUSED_TOKEN_59]",
863
- "lstrip": false,
864
- "normalized": false,
865
- "rstrip": false,
866
- "single_word": false,
867
- "special": false
868
- },
869
- "92457": {
870
- "content": "[UNUSED_TOKEN_60]",
871
- "lstrip": false,
872
- "normalized": false,
873
- "rstrip": false,
874
- "single_word": false,
875
- "special": false
876
- },
877
- "92458": {
878
- "content": "[UNUSED_TOKEN_61]",
879
- "lstrip": false,
880
- "normalized": false,
881
- "rstrip": false,
882
- "single_word": false,
883
- "special": false
884
- },
885
- "92459": {
886
- "content": "[UNUSED_TOKEN_62]",
887
- "lstrip": false,
888
- "normalized": false,
889
- "rstrip": false,
890
- "single_word": false,
891
- "special": false
892
- },
893
- "92460": {
894
- "content": "[UNUSED_TOKEN_63]",
895
- "lstrip": false,
896
- "normalized": false,
897
- "rstrip": false,
898
- "single_word": false,
899
- "special": false
900
- },
901
- "92461": {
902
- "content": "[UNUSED_TOKEN_64]",
903
- "lstrip": false,
904
- "normalized": false,
905
- "rstrip": false,
906
- "single_word": false,
907
- "special": false
908
- },
909
- "92462": {
910
- "content": "[UNUSED_TOKEN_65]",
911
- "lstrip": false,
912
- "normalized": false,
913
- "rstrip": false,
914
- "single_word": false,
915
- "special": false
916
- },
917
- "92463": {
918
- "content": "[UNUSED_TOKEN_66]",
919
- "lstrip": false,
920
- "normalized": false,
921
- "rstrip": false,
922
- "single_word": false,
923
- "special": false
924
- },
925
- "92464": {
926
- "content": "[UNUSED_TOKEN_67]",
927
- "lstrip": false,
928
- "normalized": false,
929
- "rstrip": false,
930
- "single_word": false,
931
- "special": false
932
- },
933
- "92465": {
934
- "content": "[UNUSED_TOKEN_68]",
935
- "lstrip": false,
936
- "normalized": false,
937
- "rstrip": false,
938
- "single_word": false,
939
- "special": false
940
- },
941
- "92466": {
942
- "content": "[UNUSED_TOKEN_69]",
943
- "lstrip": false,
944
- "normalized": false,
945
- "rstrip": false,
946
- "single_word": false,
947
- "special": false
948
- },
949
- "92467": {
950
- "content": "[UNUSED_TOKEN_70]",
951
- "lstrip": false,
952
- "normalized": false,
953
- "rstrip": false,
954
- "single_word": false,
955
- "special": false
956
- },
957
- "92468": {
958
- "content": "[UNUSED_TOKEN_71]",
959
- "lstrip": false,
960
- "normalized": false,
961
- "rstrip": false,
962
- "single_word": false,
963
- "special": false
964
- },
965
- "92469": {
966
- "content": "[UNUSED_TOKEN_72]",
967
- "lstrip": false,
968
- "normalized": false,
969
- "rstrip": false,
970
- "single_word": false,
971
- "special": false
972
- },
973
- "92470": {
974
- "content": "[UNUSED_TOKEN_73]",
975
- "lstrip": false,
976
- "normalized": false,
977
- "rstrip": false,
978
- "single_word": false,
979
- "special": false
980
- },
981
- "92471": {
982
- "content": "[UNUSED_TOKEN_74]",
983
- "lstrip": false,
984
- "normalized": false,
985
- "rstrip": false,
986
- "single_word": false,
987
- "special": false
988
- },
989
- "92472": {
990
- "content": "[UNUSED_TOKEN_75]",
991
- "lstrip": false,
992
- "normalized": false,
993
- "rstrip": false,
994
- "single_word": false,
995
- "special": false
996
- },
997
- "92473": {
998
- "content": "[UNUSED_TOKEN_76]",
999
- "lstrip": false,
1000
- "normalized": false,
1001
- "rstrip": false,
1002
- "single_word": false,
1003
- "special": false
1004
- },
1005
- "92474": {
1006
- "content": "[UNUSED_TOKEN_77]",
1007
- "lstrip": false,
1008
- "normalized": false,
1009
- "rstrip": false,
1010
- "single_word": false,
1011
- "special": false
1012
- },
1013
- "92475": {
1014
- "content": "[UNUSED_TOKEN_78]",
1015
- "lstrip": false,
1016
- "normalized": false,
1017
- "rstrip": false,
1018
- "single_word": false,
1019
- "special": false
1020
- },
1021
- "92476": {
1022
- "content": "[UNUSED_TOKEN_79]",
1023
- "lstrip": false,
1024
- "normalized": false,
1025
- "rstrip": false,
1026
- "single_word": false,
1027
- "special": false
1028
- },
1029
- "92477": {
1030
- "content": "[UNUSED_TOKEN_80]",
1031
- "lstrip": false,
1032
- "normalized": false,
1033
- "rstrip": false,
1034
- "single_word": false,
1035
- "special": false
1036
- },
1037
- "92478": {
1038
- "content": "[UNUSED_TOKEN_81]",
1039
- "lstrip": false,
1040
- "normalized": false,
1041
- "rstrip": false,
1042
- "single_word": false,
1043
- "special": false
1044
- },
1045
- "92479": {
1046
- "content": "[UNUSED_TOKEN_82]",
1047
- "lstrip": false,
1048
- "normalized": false,
1049
- "rstrip": false,
1050
- "single_word": false,
1051
- "special": false
1052
- },
1053
- "92480": {
1054
- "content": "[UNUSED_TOKEN_83]",
1055
- "lstrip": false,
1056
- "normalized": false,
1057
- "rstrip": false,
1058
- "single_word": false,
1059
- "special": false
1060
- },
1061
- "92481": {
1062
- "content": "[UNUSED_TOKEN_84]",
1063
- "lstrip": false,
1064
- "normalized": false,
1065
- "rstrip": false,
1066
- "single_word": false,
1067
- "special": false
1068
- },
1069
- "92482": {
1070
- "content": "[UNUSED_TOKEN_85]",
1071
- "lstrip": false,
1072
- "normalized": false,
1073
- "rstrip": false,
1074
- "single_word": false,
1075
- "special": false
1076
- },
1077
- "92483": {
1078
- "content": "[UNUSED_TOKEN_86]",
1079
- "lstrip": false,
1080
- "normalized": false,
1081
- "rstrip": false,
1082
- "single_word": false,
1083
- "special": false
1084
- },
1085
- "92484": {
1086
- "content": "[UNUSED_TOKEN_87]",
1087
- "lstrip": false,
1088
- "normalized": false,
1089
- "rstrip": false,
1090
- "single_word": false,
1091
- "special": false
1092
- },
1093
- "92485": {
1094
- "content": "[UNUSED_TOKEN_88]",
1095
- "lstrip": false,
1096
- "normalized": false,
1097
- "rstrip": false,
1098
- "single_word": false,
1099
- "special": false
1100
- },
1101
- "92486": {
1102
- "content": "[UNUSED_TOKEN_89]",
1103
- "lstrip": false,
1104
- "normalized": false,
1105
- "rstrip": false,
1106
- "single_word": false,
1107
- "special": false
1108
- },
1109
- "92487": {
1110
- "content": "[UNUSED_TOKEN_90]",
1111
- "lstrip": false,
1112
- "normalized": false,
1113
- "rstrip": false,
1114
- "single_word": false,
1115
- "special": false
1116
- },
1117
- "92488": {
1118
- "content": "[UNUSED_TOKEN_91]",
1119
- "lstrip": false,
1120
- "normalized": false,
1121
- "rstrip": false,
1122
- "single_word": false,
1123
- "special": false
1124
- },
1125
- "92489": {
1126
- "content": "[UNUSED_TOKEN_92]",
1127
- "lstrip": false,
1128
- "normalized": false,
1129
- "rstrip": false,
1130
- "single_word": false,
1131
- "special": false
1132
- },
1133
- "92490": {
1134
- "content": "[UNUSED_TOKEN_93]",
1135
- "lstrip": false,
1136
- "normalized": false,
1137
- "rstrip": false,
1138
- "single_word": false,
1139
- "special": false
1140
- },
1141
- "92491": {
1142
- "content": "[UNUSED_TOKEN_94]",
1143
- "lstrip": false,
1144
- "normalized": false,
1145
- "rstrip": false,
1146
- "single_word": false,
1147
- "special": false
1148
- },
1149
- "92492": {
1150
- "content": "[UNUSED_TOKEN_95]",
1151
- "lstrip": false,
1152
- "normalized": false,
1153
- "rstrip": false,
1154
- "single_word": false,
1155
- "special": false
1156
- },
1157
- "92493": {
1158
- "content": "[UNUSED_TOKEN_96]",
1159
- "lstrip": false,
1160
- "normalized": false,
1161
- "rstrip": false,
1162
- "single_word": false,
1163
- "special": false
1164
- },
1165
- "92494": {
1166
- "content": "[UNUSED_TOKEN_97]",
1167
- "lstrip": false,
1168
- "normalized": false,
1169
- "rstrip": false,
1170
- "single_word": false,
1171
- "special": false
1172
- },
1173
- "92495": {
1174
- "content": "[UNUSED_TOKEN_98]",
1175
- "lstrip": false,
1176
- "normalized": false,
1177
- "rstrip": false,
1178
- "single_word": false,
1179
- "special": false
1180
- },
1181
- "92496": {
1182
- "content": "[UNUSED_TOKEN_99]",
1183
- "lstrip": false,
1184
- "normalized": false,
1185
- "rstrip": false,
1186
- "single_word": false,
1187
- "special": false
1188
- },
1189
- "92497": {
1190
- "content": "[UNUSED_TOKEN_100]",
1191
- "lstrip": false,
1192
- "normalized": false,
1193
- "rstrip": false,
1194
- "single_word": false,
1195
- "special": false
1196
- },
1197
- "92498": {
1198
- "content": "[UNUSED_TOKEN_101]",
1199
- "lstrip": false,
1200
- "normalized": false,
1201
- "rstrip": false,
1202
- "single_word": false,
1203
- "special": false
1204
- },
1205
- "92499": {
1206
- "content": "[UNUSED_TOKEN_102]",
1207
- "lstrip": false,
1208
- "normalized": false,
1209
- "rstrip": false,
1210
- "single_word": false,
1211
- "special": false
1212
- },
1213
- "92500": {
1214
- "content": "[UNUSED_TOKEN_103]",
1215
- "lstrip": false,
1216
- "normalized": false,
1217
- "rstrip": false,
1218
- "single_word": false,
1219
- "special": false
1220
- },
1221
- "92501": {
1222
- "content": "[UNUSED_TOKEN_104]",
1223
- "lstrip": false,
1224
- "normalized": false,
1225
- "rstrip": false,
1226
- "single_word": false,
1227
- "special": false
1228
- },
1229
- "92502": {
1230
- "content": "[UNUSED_TOKEN_105]",
1231
- "lstrip": false,
1232
- "normalized": false,
1233
- "rstrip": false,
1234
- "single_word": false,
1235
- "special": false
1236
- },
1237
- "92503": {
1238
- "content": "[UNUSED_TOKEN_106]",
1239
- "lstrip": false,
1240
- "normalized": false,
1241
- "rstrip": false,
1242
- "single_word": false,
1243
- "special": false
1244
- },
1245
- "92504": {
1246
- "content": "[UNUSED_TOKEN_107]",
1247
- "lstrip": false,
1248
- "normalized": false,
1249
- "rstrip": false,
1250
- "single_word": false,
1251
- "special": false
1252
- },
1253
- "92505": {
1254
- "content": "[UNUSED_TOKEN_108]",
1255
- "lstrip": false,
1256
- "normalized": false,
1257
- "rstrip": false,
1258
- "single_word": false,
1259
- "special": false
1260
- },
1261
- "92506": {
1262
- "content": "[UNUSED_TOKEN_109]",
1263
- "lstrip": false,
1264
- "normalized": false,
1265
- "rstrip": false,
1266
- "single_word": false,
1267
- "special": false
1268
- },
1269
- "92507": {
1270
- "content": "[UNUSED_TOKEN_110]",
1271
- "lstrip": false,
1272
- "normalized": false,
1273
- "rstrip": false,
1274
- "single_word": false,
1275
- "special": false
1276
- },
1277
- "92508": {
1278
- "content": "[UNUSED_TOKEN_111]",
1279
- "lstrip": false,
1280
- "normalized": false,
1281
- "rstrip": false,
1282
- "single_word": false,
1283
- "special": false
1284
- },
1285
- "92509": {
1286
- "content": "[UNUSED_TOKEN_112]",
1287
- "lstrip": false,
1288
- "normalized": false,
1289
- "rstrip": false,
1290
- "single_word": false,
1291
- "special": false
1292
- },
1293
- "92510": {
1294
- "content": "[UNUSED_TOKEN_113]",
1295
- "lstrip": false,
1296
- "normalized": false,
1297
- "rstrip": false,
1298
- "single_word": false,
1299
- "special": false
1300
- },
1301
- "92511": {
1302
- "content": "[UNUSED_TOKEN_114]",
1303
- "lstrip": false,
1304
- "normalized": false,
1305
- "rstrip": false,
1306
- "single_word": false,
1307
- "special": false
1308
- },
1309
- "92512": {
1310
- "content": "[UNUSED_TOKEN_115]",
1311
- "lstrip": false,
1312
- "normalized": false,
1313
- "rstrip": false,
1314
- "single_word": false,
1315
- "special": false
1316
- },
1317
- "92513": {
1318
- "content": "[UNUSED_TOKEN_116]",
1319
- "lstrip": false,
1320
- "normalized": false,
1321
- "rstrip": false,
1322
- "single_word": false,
1323
- "special": false
1324
- },
1325
- "92514": {
1326
- "content": "[UNUSED_TOKEN_117]",
1327
- "lstrip": false,
1328
- "normalized": false,
1329
- "rstrip": false,
1330
- "single_word": false,
1331
- "special": false
1332
- },
1333
- "92515": {
1334
- "content": "[UNUSED_TOKEN_118]",
1335
- "lstrip": false,
1336
- "normalized": false,
1337
- "rstrip": false,
1338
- "single_word": false,
1339
- "special": false
1340
- },
1341
- "92516": {
1342
- "content": "[UNUSED_TOKEN_119]",
1343
- "lstrip": false,
1344
- "normalized": false,
1345
- "rstrip": false,
1346
- "single_word": false,
1347
- "special": false
1348
- },
1349
- "92517": {
1350
- "content": "[UNUSED_TOKEN_120]",
1351
- "lstrip": false,
1352
- "normalized": false,
1353
- "rstrip": false,
1354
- "single_word": false,
1355
- "special": false
1356
- },
1357
- "92518": {
1358
- "content": "[UNUSED_TOKEN_121]",
1359
- "lstrip": false,
1360
- "normalized": false,
1361
- "rstrip": false,
1362
- "single_word": false,
1363
- "special": false
1364
- },
1365
- "92519": {
1366
- "content": "[UNUSED_TOKEN_122]",
1367
- "lstrip": false,
1368
- "normalized": false,
1369
- "rstrip": false,
1370
- "single_word": false,
1371
- "special": false
1372
- },
1373
- "92520": {
1374
- "content": "[UNUSED_TOKEN_123]",
1375
- "lstrip": false,
1376
- "normalized": false,
1377
- "rstrip": false,
1378
- "single_word": false,
1379
- "special": false
1380
- },
1381
- "92521": {
1382
- "content": "[UNUSED_TOKEN_124]",
1383
- "lstrip": false,
1384
- "normalized": false,
1385
- "rstrip": false,
1386
- "single_word": false,
1387
- "special": false
1388
- },
1389
- "92522": {
1390
- "content": "[UNUSED_TOKEN_125]",
1391
- "lstrip": false,
1392
- "normalized": false,
1393
- "rstrip": false,
1394
- "single_word": false,
1395
- "special": false
1396
- },
1397
- "92523": {
1398
- "content": "[UNUSED_TOKEN_126]",
1399
- "lstrip": false,
1400
- "normalized": false,
1401
- "rstrip": false,
1402
- "single_word": false,
1403
- "special": false
1404
- },
1405
- "92524": {
1406
- "content": "[UNUSED_TOKEN_127]",
1407
- "lstrip": false,
1408
- "normalized": false,
1409
- "rstrip": false,
1410
- "single_word": false,
1411
- "special": false
1412
- },
1413
- "92525": {
1414
- "content": "[UNUSED_TOKEN_128]",
1415
- "lstrip": false,
1416
- "normalized": false,
1417
- "rstrip": false,
1418
- "single_word": false,
1419
- "special": false
1420
- },
1421
- "92526": {
1422
- "content": "[UNUSED_TOKEN_129]",
1423
- "lstrip": false,
1424
- "normalized": false,
1425
- "rstrip": false,
1426
- "single_word": false,
1427
- "special": false
1428
- },
1429
- "92527": {
1430
- "content": "[UNUSED_TOKEN_130]",
1431
- "lstrip": false,
1432
- "normalized": false,
1433
- "rstrip": false,
1434
- "single_word": false,
1435
- "special": false
1436
- },
1437
- "92528": {
1438
- "content": "[UNUSED_TOKEN_131]",
1439
- "lstrip": false,
1440
- "normalized": false,
1441
- "rstrip": false,
1442
- "single_word": false,
1443
- "special": false
1444
- },
1445
- "92529": {
1446
- "content": "[UNUSED_TOKEN_132]",
1447
- "lstrip": false,
1448
- "normalized": false,
1449
- "rstrip": false,
1450
- "single_word": false,
1451
- "special": false
1452
- },
1453
- "92530": {
1454
- "content": "[UNUSED_TOKEN_133]",
1455
- "lstrip": false,
1456
- "normalized": false,
1457
- "rstrip": false,
1458
- "single_word": false,
1459
- "special": false
1460
- },
1461
- "92531": {
1462
- "content": "[UNUSED_TOKEN_134]",
1463
- "lstrip": false,
1464
- "normalized": false,
1465
- "rstrip": false,
1466
- "single_word": false,
1467
- "special": false
1468
- },
1469
- "92532": {
1470
- "content": "[UNUSED_TOKEN_135]",
1471
- "lstrip": false,
1472
- "normalized": false,
1473
- "rstrip": false,
1474
- "single_word": false,
1475
- "special": false
1476
- },
1477
- "92533": {
1478
- "content": "[UNUSED_TOKEN_136]",
1479
- "lstrip": false,
1480
- "normalized": false,
1481
- "rstrip": false,
1482
- "single_word": false,
1483
- "special": false
1484
- },
1485
- "92534": {
1486
- "content": "[UNUSED_TOKEN_137]",
1487
- "lstrip": false,
1488
- "normalized": false,
1489
- "rstrip": false,
1490
- "single_word": false,
1491
- "special": false
1492
- },
1493
- "92535": {
1494
- "content": "[UNUSED_TOKEN_138]",
1495
- "lstrip": false,
1496
- "normalized": false,
1497
- "rstrip": false,
1498
- "single_word": false,
1499
- "special": false
1500
- },
1501
- "92536": {
1502
- "content": "[UNUSED_TOKEN_139]",
1503
- "lstrip": false,
1504
- "normalized": false,
1505
- "rstrip": false,
1506
- "single_word": false,
1507
- "special": false
1508
- },
1509
- "92537": {
1510
- "content": "[UNUSED_TOKEN_140]",
1511
- "lstrip": false,
1512
- "normalized": false,
1513
- "rstrip": false,
1514
- "single_word": false,
1515
- "special": false
1516
- },
1517
- "92538": {
1518
- "content": "<|plugin|>",
1519
- "lstrip": false,
1520
- "normalized": false,
1521
- "rstrip": false,
1522
- "single_word": false,
1523
- "special": true
1524
- },
1525
- "92539": {
1526
- "content": "<|interpreter|>",
1527
- "lstrip": false,
1528
- "normalized": false,
1529
- "rstrip": false,
1530
- "single_word": false,
1531
- "special": true
1532
- },
1533
- "92540": {
1534
- "content": "<|action_end|>",
1535
- "lstrip": false,
1536
- "normalized": false,
1537
- "rstrip": false,
1538
- "single_word": false,
1539
- "special": true
1540
- },
1541
- "92541": {
1542
- "content": "<|action_start|>",
1543
- "lstrip": false,
1544
- "normalized": false,
1545
- "rstrip": false,
1546
- "single_word": false,
1547
- "special": true
1548
- },
1549
- "92542": {
1550
- "content": "<|im_end|>",
1551
- "lstrip": false,
1552
- "normalized": false,
1553
- "rstrip": false,
1554
- "single_word": false,
1555
- "special": true
1556
- },
1557
- "92543": {
1558
- "content": "<|im_start|>",
1559
- "lstrip": false,
1560
- "normalized": false,
1561
- "rstrip": false,
1562
- "single_word": false,
1563
- "special": true
1564
- },
1565
- "92544": {
1566
- "content": "[UNUSED_TOKEN_141]",
1567
- "lstrip": false,
1568
- "normalized": false,
1569
- "rstrip": false,
1570
- "single_word": false,
1571
- "special": false
1572
- },
1573
- "92545": {
1574
- "content": "[UNUSED_TOKEN_142]",
1575
- "lstrip": false,
1576
- "normalized": false,
1577
- "rstrip": false,
1578
- "single_word": false,
1579
- "special": false
1580
- },
1581
- "92546": {
1582
- "content": "[UNUSED_TOKEN_143]",
1583
- "lstrip": false,
1584
- "normalized": false,
1585
- "rstrip": false,
1586
- "single_word": false,
1587
- "special": false
1588
- },
1589
- "92547": {
1590
- "content": "[UNUSED_TOKEN_144]",
1591
- "lstrip": false,
1592
- "normalized": false,
1593
- "rstrip": false,
1594
- "single_word": false,
1595
- "special": false
1596
- },
1597
- "92548": {
1598
- "content": "[UNUSED_TOKEN_145]",
1599
- "lstrip": false,
1600
- "normalized": false,
1601
- "rstrip": false,
1602
- "single_word": false,
1603
- "special": false
1604
- },
1605
- "92549": {
1606
- "content": "[UNUSED_TOKEN_146]",
1607
- "lstrip": false,
1608
- "normalized": false,
1609
- "rstrip": false,
1610
- "single_word": false,
1611
- "special": false
1612
- }
1613
- },
1614
- "additional_special_tokens": [
1615
- "<|im_start|>",
1616
- "<|im_end|>",
1617
- "<|action_start|>",
1618
- "<|action_end|>",
1619
- "<|interpreter|>",
1620
- "<|plugin|>"
1621
- ],
1622
- "auto_map": {
1623
- "AutoTokenizer": [
1624
- "tokenization_internlm2.InternLM2Tokenizer",
1625
- "tokenization_internlm2_fast.InternLM2TokenizerFast"
1626
- ]
1627
- },
1628
- "bos_token": "<s>",
1629
- "chat_template": "{{ '<s>' }}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|im_start|>system\n' + system_message + '<|im_end|>\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\n' + content + '<|im_end|>\n<|im_start|>assistant\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>\n' }}{% endif %}{% endfor %}",
1630
- "clean_up_tokenization_spaces": false,
1631
- "decode_with_prefix_space": false,
1632
- "eos_token": "</s>",
1633
- "model_max_length": 1000000000000000019884624838656,
1634
- "pad_token": "</s>",
1635
- "padding_side": "left",
1636
- "sp_model_kwargs": null,
1637
- "split_special_tokens": false,
1638
- "tokenizer_class": "InternLM2Tokenizer",
1639
- "unk_token": "<unk>"
1640
- }
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/README.md ADDED
@@ -0,0 +1,70 @@
+ ---
+ license: other
+ library_name: peft
+ tags:
+ - llama-factory
+ - lora
+ - generated_from_trainer
+ base_model: THUDM/glm-4-9b-chat-1m
+ metrics:
+ - accuracy
+ model-index:
+ - name: sft_bf16_p1_full
+ results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # sft_bf16_p1_full
+
+ This model is a fine-tuned version of [THUDM/glm-4-9b-chat-1m](https://huggingface.co/THUDM/glm-4-9b-chat-1m) on the alpaca_mgtv_p1 dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 0.1995
+ - Accuracy: 0.9332
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.0001
+ - train_batch_size: 16
+ - eval_batch_size: 1
+ - seed: 42
+ - gradient_accumulation_steps: 8
+ - total_train_batch_size: 128
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 4.0
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss | Accuracy |
+ |:-------------:|:------:|:----:|:---------------:|:--------:|
+ | 0.8958 | 0.9950 | 175 | 0.4473 | 0.7613 |
+ | 0.1917 | 1.9900 | 350 | 0.1856 | 0.9307 |
+ | 0.1287 | 2.9851 | 525 | 0.1813 | 0.9337 |
+ | 0.0755 | 3.9801 | 700 | 0.1995 | 0.9332 |
+
+
+ ### Framework versions
+
+ - PEFT 0.11.1
+ - Transformers 4.41.2
+ - Pytorch 2.2.1+cu121
+ - Datasets 2.19.1
+ - Tokenizers 0.19.1
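For orientation, a minimal sketch of how the adapter recorded in this directory could be attached to the GLM-4 base model for inference with `transformers` and `peft`. The local adapter path is an assumption; any directory holding the `adapter_config.json` and `adapter_model.safetensors` from this commit should work the same way.

```python
# Minimal sketch: attach the LoRA adapter from this commit to the GLM-4 base model.
# ADAPTER_DIR is a hypothetical local path to the files shown in this directory.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE_MODEL = "THUDM/glm-4-9b-chat-1m"
ADAPTER_DIR = "llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.bfloat16,
    device_map="auto",
    trust_remote_code=True,
)
model = PeftModel.from_pretrained(model, ADAPTER_DIR)  # loads adapter_model.safetensors
model.eval()
```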
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/adapter_config.json ADDED
@@ -0,0 +1,31 @@
+ {
+ "alpha_pattern": {},
+ "auto_mapping": null,
+ "base_model_name_or_path": "THUDM/glm-4-9b-chat-1m",
+ "bias": "none",
+ "fan_in_fan_out": false,
+ "inference_mode": true,
+ "init_lora_weights": true,
+ "layer_replication": null,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "loftq_config": {},
+ "lora_alpha": 16,
+ "lora_dropout": 0.0,
+ "megatron_config": null,
+ "megatron_core": "megatron.core",
+ "modules_to_save": null,
+ "peft_type": "LORA",
+ "r": 8,
+ "rank_pattern": {},
+ "revision": null,
+ "target_modules": [
+ "dense_h_to_4h",
+ "query_key_value",
+ "dense_4h_to_h",
+ "dense"
+ ],
+ "task_type": "CAUSAL_LM",
+ "use_dora": false,
+ "use_rslora": false
+ }
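Expressed as a `peft.LoraConfig`, the settings above would look roughly like the sketch below. This is an illustration of how the JSON fields map onto PEFT arguments, not the exact object LLaMA-Factory constructed.

```python
# Sketch: the adapter_config.json fields above expressed as a peft.LoraConfig.
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,                    # "r"
    lora_alpha=16,          # "lora_alpha"
    lora_dropout=0.0,       # "lora_dropout"
    bias="none",            # "bias"
    task_type="CAUSAL_LM",  # "task_type"
    target_modules=[        # GLM-4 linear layers the adapter is applied to
        "query_key_value",
        "dense",
        "dense_h_to_4h",
        "dense_4h_to_h",
    ],
)
```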
llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00002-of-00008.safetensors → saves/glm-4-9b/lora/sft_bf16_p1_full/adapter_model.safetensors} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:a8ebc89c15e39300cdb8876edace963f8a1b03c9092bae105970c1b1b7c7c92a
- size 1946242696
+ oid sha256:779d919e4e576eb536f72ff440fea92eb01a8b6522a276d586a48fc2f24d1fd2
+ size 85409560
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/added_tokens.json ADDED
@@ -0,0 +1,16 @@
+ {
+ "<eop>": 151334,
+ "<sop>": 151333,
+ "<|assistant|>": 151337,
+ "<|begin_of_image|>": 151339,
+ "<|begin_of_video|>": 151341,
+ "<|end_of_image|>": 151340,
+ "<|end_of_video|>": 151342,
+ "<|endoftext|>": 151329,
+ "<|observation|>": 151338,
+ "<|system|>": 151335,
+ "<|user|>": 151336,
+ "[MASK]": 151330,
+ "[gMASK]": 151331,
+ "[sMASK]": 151332
+ }
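The ids above run contiguously from 151329 to 151342, i.e. they sit directly after the base BPE vocabulary. A small sketch for sanity-checking a local copy of the file (the filename is an assumption):

```python
# Sketch: sanity-check the added-token ids from a local copy of added_tokens.json.
import json

with open("added_tokens.json") as f:  # hypothetical local path
    added = json.load(f)

assert added["<|endoftext|>"] == 151329      # first added id
assert added["<|end_of_video|>"] == 151342   # last added id
assert len(added) == 14
print(sorted(added.items(), key=lambda kv: kv[1]))
```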
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/all_results.json ADDED
@@ -0,0 +1,13 @@
+ {
+ "epoch": 3.9800995024875623,
+ "eval_accuracy": 0.9332066666666665,
+ "eval_loss": 0.1994711458683014,
+ "eval_runtime": 135.9458,
+ "eval_samples_per_second": 18.39,
+ "eval_steps_per_second": 18.39,
+ "total_flos": 1.742929467193688e+18,
+ "train_loss": 0.2920300728934152,
+ "train_runtime": 10146.3014,
+ "train_samples_per_second": 8.87,
+ "train_steps_per_second": 0.069
+ }
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ library_name: peft
3
+ base_model: THUDM/glm-4-9b-chat-1m
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/adapter_config.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "THUDM/glm-4-9b-chat-1m",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "dense_h_to_4h",
24
+ "query_key_value",
25
+ "dense_4h_to_h",
26
+ "dense"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00003-of-00008.safetensors → saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/adapter_model.safetensors} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f3f4271e5f6d4235465888481ec0c0b8ab1a8dcaea7d32c7c5517a3b0507bf47
- size 1979780440
+ oid sha256:d0df7f0690c53c417ae34e98e48ad020e0a526362591869d7ba83f0c43ef8a4f
+ size 85409560
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/added_tokens.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "<eop>": 151334,
3
+ "<sop>": 151333,
4
+ "<|assistant|>": 151337,
5
+ "<|begin_of_image|>": 151339,
6
+ "<|begin_of_video|>": 151341,
7
+ "<|end_of_image|>": 151340,
8
+ "<|end_of_video|>": 151342,
9
+ "<|endoftext|>": 151329,
10
+ "<|observation|>": 151338,
11
+ "<|system|>": 151335,
12
+ "<|user|>": 151336,
13
+ "[MASK]": 151330,
14
+ "[gMASK]": 151331,
15
+ "[sMASK]": 151332
16
+ }
llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full/model-00004-of-00008.safetensors → saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/optimizer.pt} RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:25a4f0969df00963d23269ce06fb4ceff910e5fe8871bb0e8a387ca3be449fb7
- size 1946242728
+ oid sha256:c5d56ac96c6f89282175acfa3642c1ef58146678d7473bc11de09f140e75f2a3
+ size 170990330
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/rng_state.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8d138cfe3a4adf21f048848ee35837c9a757a0a3616ff7adbb45b69aac247435
+ size 14244
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/scheduler.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:260dbe7a290a3eb6243acf2b7854e91fda974ddc05a7f2e6ef4d24d9dbbc4233
+ size 1064
llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full → saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350}/special_tokens_map.json RENAMED
@@ -1,35 +1,29 @@
  {
  "additional_special_tokens": [
- "<|im_start|>",
- "<|im_end|>",
- "<|action_start|>",
- "<|action_end|>",
- "<|interpreter|>",
- "<|plugin|>"
+ "<|endoftext|>",
+ "[MASK]",
+ "[gMASK]",
+ "[sMASK]",
+ "<sop>",
+ "<eop>",
+ "<|system|>",
+ "<|user|>",
+ "<|assistant|>",
+ "<|observation|>",
+ "<|begin_of_image|>",
+ "<|end_of_image|>",
+ "<|begin_of_video|>",
+ "<|end_of_video|>"
  ],
- "bos_token": {
- "content": "<s>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
  "eos_token": {
- "content": "</s>",
+ "content": "<|endoftext|>",
  "lstrip": false,
  "normalized": false,
  "rstrip": false,
  "single_word": false
  },
  "pad_token": {
- "content": "</s>",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false
- },
- "unk_token": {
- "content": "<unk>",
+ "content": "<|endoftext|>",
  "lstrip": false,
  "normalized": false,
  "rstrip": false,
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/tokenization_chatglm.py ADDED
@@ -0,0 +1,323 @@
1
+ import regex as re
2
+ import base64
3
+ import os
4
+ import json
5
+ import tiktoken
6
+ from torch import TensorType
7
+ from typing import List, Optional, Union, Dict, Any
8
+ from transformers import PreTrainedTokenizer
9
+ from transformers.utils import logging, PaddingStrategy
10
+ from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
11
+
12
+
13
+ class ChatGLM4Tokenizer(PreTrainedTokenizer):
14
+ vocab_files_names = {"vocab_file": "tokenizer.model"}
15
+ model_input_names = ["input_ids", "attention_mask", "position_ids"]
16
+
17
+ def __init__(
18
+ self,
19
+ vocab_file,
20
+ padding_side="left",
21
+ clean_up_tokenization_spaces=False,
22
+ encode_special_tokens=False,
23
+ **kwargs
24
+ ):
25
+ self.name = "GLM4Tokenizer"
26
+ self.vocab_file = vocab_file
27
+ pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
28
+ self.pat_str = re.compile(pat_str)
29
+ self.encode_special_tokens = encode_special_tokens
30
+
31
+ mergeable_ranks = {}
32
+ with open(vocab_file) as f:
33
+ for line in f:
34
+ token, rank = line.strip().split()
35
+ rank = int(rank)
36
+ token = base64.b64decode(token)
37
+ mergeable_ranks[token] = rank
38
+
39
+ self.mergeable_ranks = mergeable_ranks
40
+
41
+ self.tokenizer = tiktoken.Encoding(
42
+ name="my_tokenizer",
43
+ pat_str=pat_str,
44
+ mergeable_ranks=mergeable_ranks,
45
+ special_tokens={}
46
+ )
47
+ self.decoder = {rank: token for token, rank in mergeable_ranks.items()}
48
+ self.n_words = len(self.decoder)
49
+
50
+ super().__init__(
51
+ padding_side=padding_side,
52
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
53
+ **kwargs
54
+ )
55
+
56
+ @property
57
+ def vocab_size(self):
58
+ return self.n_words
59
+
60
+ def get_vocab(self):
61
+ """ Returns vocab as a dict """
62
+ vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
63
+ vocab.update(self.added_tokens_encoder)
64
+ return vocab
65
+
66
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
67
+ """
68
+ Converts a sequence of tokens in a single string.
69
+ """
70
+ text = ""
71
+ temp = b""
72
+ for t in tokens:
73
+ if isinstance(t, int):
74
+ t = chr(t)
75
+ if isinstance(t, str):
76
+ if temp:
77
+ text += temp.decode("utf-8", errors="replace")
78
+ elif isinstance(t, bytes):
79
+ temp += t
80
+ else:
81
+ raise TypeError("token should only be of type int, bytes or str")
82
+ if temp:
83
+ text += temp.decode("utf-8", errors="replace")
84
+ return text
85
+
86
+ def _tokenize(self, text, **kwargs):
87
+ tokens = []
88
+ ids = self.tokenizer.encode(text)
89
+ for t in ids:
90
+ tokens.append(self.decoder[t])
91
+ return tokens
92
+
93
+ def _convert_token_to_id(self, token):
94
+ """ Converts a token (str) in an id using the vocab. """
95
+ return self.mergeable_ranks[token]
96
+
97
+ def _convert_id_to_token(self, index):
98
+ """Converts an index (integer) in a token (str) using the vocab."""
99
+ return self.decoder.get(index, "")
100
+
101
+ def save_vocabulary(self, save_directory, filename_prefix=None):
102
+ """
103
+ Save the vocabulary and special tokens file to a directory.
104
+
105
+ Args:
106
+ save_directory (`str`):
107
+ The directory in which to save the vocabulary.
108
+ filename_prefix (`str`, *optional*):
109
+ An optional prefix to add to the named of the saved files.
110
+
111
+ Returns:
112
+ `Tuple(str)`: Paths to the files saved.
113
+ """
114
+ if os.path.isdir(save_directory):
115
+ vocab_file = os.path.join(
116
+ save_directory, self.vocab_files_names["vocab_file"]
117
+ )
118
+ else:
119
+ vocab_file = save_directory
120
+
121
+ with open(self.vocab_file, 'rb') as fin:
122
+ proto_str = fin.read()
123
+
124
+ with open(vocab_file, "wb") as writer:
125
+ writer.write(proto_str)
126
+
127
+ return (vocab_file,)
128
+
129
+ def get_prefix_tokens(self):
130
+ prefix_tokens = [self.convert_tokens_to_ids("[gMASK]"), self.convert_tokens_to_ids("<sop>")]
131
+ return prefix_tokens
132
+
133
+ def build_single_message(self, role, metadata, message, tokenize=True):
134
+ assert role in ["system", "user", "assistant", "observation"], role
135
+ if tokenize:
136
+ role_tokens = [self.convert_tokens_to_ids(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n",
137
+ disallowed_special=())
138
+ message_tokens = self.tokenizer.encode(message, disallowed_special=())
139
+ tokens = role_tokens + message_tokens
140
+ return tokens
141
+ else:
142
+ return str(f"<|{role}|>{metadata}\n{message}")
143
+
144
+ # Use Jinja Template in tokenizer_config.json
145
+ # def apply_chat_template(
146
+ # self,
147
+ # conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], "Conversation"],
148
+ # add_generation_prompt: bool = False,
149
+ # tokenize: bool = True,
150
+ # padding: bool = False,
151
+ # truncation: bool = False,
152
+ # max_length: Optional[int] = None,
153
+ # return_tensors: Optional[Union[str, TensorType]] = None,
154
+ # return_dict: bool = False,
155
+ # tokenizer_kwargs: Optional[Dict[str, Any]] = None,
156
+ # add_special_tokens: bool = True,
157
+ # **kwargs,
158
+ # ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
159
+ #
160
+ # if return_dict and not tokenize:
161
+ # raise ValueError(
162
+ # "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
163
+ # "of tokenizer outputs to return."
164
+ # )
165
+ #
166
+ # def handle_single_conversation(conversation):
167
+ # input_ids = self.get_prefix_tokens() if add_special_tokens else []
168
+ # input_message = "[gMASK]<sop>" if add_special_tokens else ""
169
+ # for item in conversation:
170
+ # if item.get("tools"):
171
+ # tools = item["tools"]
172
+ # content = "你是一个名为 GhatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。"
173
+ # content += "\n\n# 可用工具"
174
+ # for tool in tools:
175
+ # if tool["type"] == "function":
176
+ # function = tool["function"]
177
+ # content += f"\n\n## {function['name']}\n\n{json.dumps(function, ensure_ascii=False, indent=4)}"
178
+ # content += "\n在调用上述函数时,请使用 Json 格式表示调用的参数。"
179
+ # elif tool["type"] == "python":
180
+ # content += "\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。"
181
+ # elif tool["type"] == "simple_browser":
182
+ # content += "\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。"
183
+ # elif tool["type"] == "cogview":
184
+ # content += "\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。"
185
+ # else:
186
+ # raise NotImplementedError(f"Unknown tool type {tool['type']}")
187
+ # input = self.build_single_message("system", "", content, tokenize=tokenize)
188
+ # if tokenize:
189
+ # input_ids.extend(input)
190
+ # else:
191
+ # input_message += input
192
+ # if item["content"]:
193
+ # input = self.build_single_message(
194
+ # item["role"],
195
+ # item.get("metadata", ""),
196
+ # item["content"],
197
+ # tokenize=tokenize
198
+ # )
199
+ # if tokenize:
200
+ # input_ids.extend(input)
201
+ # else:
202
+ # input_message += input
203
+ # if add_generation_prompt:
204
+ # if tokenize:
205
+ # input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
206
+ # else:
207
+ # input_message += "<|assistant|>"
208
+ # return input_ids if tokenize else input_message
209
+ #
210
+ # # Main logic to handle different conversation formats
211
+ # if isinstance(conversation, list) and all(isinstance(i, dict) for i in conversation):
212
+ # result = handle_single_conversation(conversation)
213
+ # elif isinstance(conversation, list) and all(isinstance(i, list) for i in conversation):
214
+ # result = [handle_single_conversation(c) for c in conversation]
215
+ # elif hasattr(conversation, "messages"):
216
+ # result = handle_single_conversation(conversation.messages)
217
+ # else:
218
+ # raise ValueError("Invalid conversation format")
219
+ #
220
+ # if tokenize:
221
+ # output = self.batch_encode_plus(
222
+ # [result] if isinstance(result[0], int) else result,
223
+ # padding=padding,
224
+ # truncation=truncation,
225
+ # max_length=max_length,
226
+ # return_tensors=return_tensors,
227
+ # is_split_into_words=True,
228
+ # add_special_tokens=False
229
+ # )
230
+ # if return_dict:
231
+ # return output
232
+ # else:
233
+ # return output["input_ids"]
234
+ # else:
235
+ # return result
236
+
237
+ def build_inputs_with_special_tokens(
238
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
239
+ ) -> List[int]:
240
+ """
241
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
242
+ adding special tokens. A BERT sequence has the following format:
243
+
244
+ - single sequence: `[CLS] X [SEP]`
245
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
246
+
247
+ Args:
248
+ token_ids_0 (`List[int]`):
249
+ List of IDs to which the special tokens will be added.
250
+ token_ids_1 (`List[int]`, *optional*):
251
+ Optional second list of IDs for sequence pairs.
252
+
253
+ Returns:
254
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
255
+ """
256
+ prefix_tokens = self.get_prefix_tokens()
257
+ token_ids_0 = prefix_tokens + token_ids_0
258
+ if token_ids_1 is not None:
259
+ token_ids_0 = token_ids_0 + token_ids_1 + [self.convert_tokens_to_ids("<eos>")]
260
+ return token_ids_0
261
+
262
+ def _pad(
263
+ self,
264
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
265
+ max_length: Optional[int] = None,
266
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
267
+ pad_to_multiple_of: Optional[int] = None,
268
+ return_attention_mask: Optional[bool] = None,
269
+ ) -> dict:
270
+ """
271
+ Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
272
+
273
+ Args:
274
+ encoded_inputs:
275
+ Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
276
+ max_length: maximum length of the returned list and optionally padding length (see below).
277
+ Will truncate by taking into account the special tokens.
278
+ padding_strategy: PaddingStrategy to use for padding.
279
+
280
+ - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
281
+ - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
282
+ - PaddingStrategy.DO_NOT_PAD: Do not pad
283
+ The tokenizer padding sides are defined in self.padding_side:
284
+
285
+ - 'left': pads on the left of the sequences
286
+ - 'right': pads on the right of the sequences
287
+ pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
288
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
289
+ `>= 7.5` (Volta).
290
+ return_attention_mask:
291
+ (optional) Set to False to avoid returning attention mask (default: set to model specifics)
292
+ """
293
+ # Load from model defaults
294
+ assert self.padding_side == "left"
295
+
296
+ required_input = encoded_inputs[self.model_input_names[0]]
297
+ seq_length = len(required_input)
298
+
299
+ if padding_strategy == PaddingStrategy.LONGEST:
300
+ max_length = len(required_input)
301
+
302
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
303
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
304
+
305
+ needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
306
+
307
+ # Initialize attention mask if not present.
308
+ if "attention_mask" not in encoded_inputs:
309
+ encoded_inputs["attention_mask"] = [1] * seq_length
310
+
311
+ if "position_ids" not in encoded_inputs:
312
+ encoded_inputs["position_ids"] = list(range(seq_length))
313
+
314
+ if needs_to_be_padded:
315
+ difference = max_length - len(required_input)
316
+
317
+ if "attention_mask" in encoded_inputs:
318
+ encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
319
+ if "position_ids" in encoded_inputs:
320
+ encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
321
+ encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
322
+
323
+ return encoded_inputs
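A short usage sketch for the tokenizer class above: loading it through `AutoTokenizer` with `trust_remote_code=True` and building a GLM-4 style prompt with `get_prefix_tokens` and `build_single_message`. The checkpoint path is an assumption.

```python
# Sketch: load the custom tokenizer and build a [gMASK]<sop> ... <|assistant|> prompt by hand.
from transformers import AutoTokenizer

CKPT = "llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350"  # hypothetical path

tok = AutoTokenizer.from_pretrained(CKPT, trust_remote_code=True)  # resolves ChatGLM4Tokenizer

ids = tok.get_prefix_tokens()                          # [gMASK], <sop>
ids += tok.build_single_message("user", "", "Hello")   # <|user|> + "\n" + message tokens
ids += [tok.convert_tokens_to_ids("<|assistant|>")]    # generation prompt
print(tok.decode(ids))
```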
llama-factory/{merged_models/internlm2_5-7b-chat-1m_sft_bf16_p2_full → saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350}/tokenizer.model RENAMED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:f868398fc4e05ee1e8aeba95ddf18ddcc45b8bce55d5093bead5bbf80429b48b
- size 1477754
+ oid sha256:5a493598071550244b2ee7f26118f3edec2150b9dfa967929a99052ac83fe716
+ size 2623634
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/tokenizer_config.json ADDED
@@ -0,0 +1,148 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "151329": {
4
+ "content": "<|endoftext|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "151330": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "151331": {
20
+ "content": "[gMASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "151332": {
28
+ "content": "[sMASK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "151333": {
36
+ "content": "<sop>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "151334": {
44
+ "content": "<eop>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "151335": {
52
+ "content": "<|system|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "151336": {
60
+ "content": "<|user|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "151337": {
68
+ "content": "<|assistant|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "151338": {
76
+ "content": "<|observation|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "151339": {
84
+ "content": "<|begin_of_image|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "151340": {
92
+ "content": "<|end_of_image|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "151341": {
100
+ "content": "<|begin_of_video|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "151342": {
108
+ "content": "<|end_of_video|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ }
115
+ },
116
+ "additional_special_tokens": [
117
+ "<|endoftext|>",
118
+ "[MASK]",
119
+ "[gMASK]",
120
+ "[sMASK]",
121
+ "<sop>",
122
+ "<eop>",
123
+ "<|system|>",
124
+ "<|user|>",
125
+ "<|assistant|>",
126
+ "<|observation|>",
127
+ "<|begin_of_image|>",
128
+ "<|end_of_image|>",
129
+ "<|begin_of_video|>",
130
+ "<|end_of_video|>"
131
+ ],
132
+ "auto_map": {
133
+ "AutoTokenizer": [
134
+ "tokenization_chatglm.ChatGLM4Tokenizer",
135
+ null
136
+ ]
137
+ },
138
+ "chat_template": "{{ '[gMASK]<sop>' }}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|system|>\n' + system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + content + '<|assistant|>' }}{% elif message['role'] == 'assistant' %}{{ '\n' + content }}{% endif %}{% endfor %}",
139
+ "clean_up_tokenization_spaces": false,
140
+ "do_lower_case": false,
141
+ "eos_token": "<|endoftext|>",
142
+ "model_max_length": 1024000,
143
+ "pad_token": "<|endoftext|>",
144
+ "padding_side": "right",
145
+ "remove_space": false,
146
+ "split_special_tokens": false,
147
+ "tokenizer_class": "ChatGLM4Tokenizer"
148
+ }
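The `chat_template` above renders conversations as `[gMASK]<sop><|system|>\n...<|user|>\n...<|assistant|>`. A minimal sketch of exercising it through the standard `apply_chat_template` API (checkpoint path assumed):

```python
# Sketch: render a conversation through the Jinja chat_template defined above.
from transformers import AutoTokenizer

CKPT = "llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350"  # hypothetical path
tok = AutoTokenizer.from_pretrained(CKPT, trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Who are you?"},
]
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)
# Expected shape, per the template:
# [gMASK]<sop><|system|>
# You are a helpful assistant.<|user|>
# Who are you?<|assistant|>
```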
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/trainer_state.json ADDED
@@ -0,0 +1,296 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 1.9900497512437811,
5
+ "eval_steps": 175,
6
+ "global_step": 350,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05685856432125089,
13
+ "grad_norm": 2.919694662094116,
14
+ "learning_rate": 1.4285714285714285e-05,
15
+ "loss": 3.8009,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.11371712864250177,
20
+ "grad_norm": 3.130059003829956,
21
+ "learning_rate": 2.857142857142857e-05,
22
+ "loss": 0.3289,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.17057569296375266,
27
+ "grad_norm": 1.6621949672698975,
28
+ "learning_rate": 4.2857142857142856e-05,
29
+ "loss": 0.2598,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.22743425728500355,
34
+ "grad_norm": 2.371370792388916,
35
+ "learning_rate": 5.714285714285714e-05,
36
+ "loss": 0.2401,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.28429282160625446,
41
+ "grad_norm": 0.8625539541244507,
42
+ "learning_rate": 7.142857142857143e-05,
43
+ "loss": 0.2306,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.3411513859275053,
48
+ "grad_norm": 5.281027793884277,
49
+ "learning_rate": 8.571428571428571e-05,
50
+ "loss": 0.2463,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.39800995024875624,
55
+ "grad_norm": 1.4231479167938232,
56
+ "learning_rate": 0.0001,
57
+ "loss": 0.2291,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.4548685145700071,
62
+ "grad_norm": 1.9341018199920654,
63
+ "learning_rate": 9.993784606094612e-05,
64
+ "loss": 0.2301,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.511727078891258,
69
+ "grad_norm": 2.0011138916015625,
70
+ "learning_rate": 9.975153876827008e-05,
71
+ "loss": 0.2207,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.5685856432125089,
76
+ "grad_norm": 0.6652698516845703,
77
+ "learning_rate": 9.944154131125642e-05,
78
+ "loss": 0.2271,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.6254442075337597,
83
+ "grad_norm": 1.3393596410751343,
84
+ "learning_rate": 9.900862439242719e-05,
85
+ "loss": 0.216,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.6823027718550106,
90
+ "grad_norm": 5.120615482330322,
91
+ "learning_rate": 9.84538643114539e-05,
92
+ "loss": 0.2311,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.7391613361762616,
97
+ "grad_norm": 1.72005033493042,
98
+ "learning_rate": 9.777864028930705e-05,
99
+ "loss": 0.212,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.7960199004975125,
104
+ "grad_norm": 1.5774250030517578,
105
+ "learning_rate": 9.698463103929542e-05,
106
+ "loss": 0.2261,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.8528784648187633,
111
+ "grad_norm": 93.82937622070312,
112
+ "learning_rate": 9.607381059352038e-05,
113
+ "loss": 2.368,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.9097370291400142,
118
+ "grad_norm": 11.905146598815918,
119
+ "learning_rate": 9.504844339512095e-05,
120
+ "loss": 2.4337,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.9665955934612651,
125
+ "grad_norm": 4.584591388702393,
126
+ "learning_rate": 9.391107866851143e-05,
127
+ "loss": 0.8958,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.9950248756218906,
132
+ "eval_accuracy": 0.7612800000000001,
133
+ "eval_loss": 0.4472738206386566,
134
+ "eval_runtime": 136.3095,
135
+ "eval_samples_per_second": 18.341,
136
+ "eval_steps_per_second": 18.341,
137
+ "step": 175
138
+ },
139
+ {
140
+ "epoch": 1.023454157782516,
141
+ "grad_norm": 4.700326919555664,
142
+ "learning_rate": 9.266454408160779e-05,
143
+ "loss": 0.4271,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 1.080312722103767,
148
+ "grad_norm": 1.9691985845565796,
149
+ "learning_rate": 9.131193871579975e-05,
150
+ "loss": 0.2324,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 1.1371712864250179,
155
+ "grad_norm": 2.0088887214660645,
156
+ "learning_rate": 8.985662536114613e-05,
157
+ "loss": 0.1996,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 1.1940298507462686,
162
+ "grad_norm": 6.571439266204834,
163
+ "learning_rate": 8.83022221559489e-05,
164
+ "loss": 0.1961,
165
+ "step": 210
166
+ },
167
+ {
168
+ "epoch": 1.2508884150675195,
169
+ "grad_norm": 3.290731191635132,
170
+ "learning_rate": 8.665259359149132e-05,
171
+ "loss": 0.1942,
172
+ "step": 220
173
+ },
174
+ {
175
+ "epoch": 1.3077469793887704,
176
+ "grad_norm": 1.540110468864441,
177
+ "learning_rate": 8.491184090430364e-05,
178
+ "loss": 0.2152,
179
+ "step": 230
180
+ },
181
+ {
182
+ "epoch": 1.3646055437100213,
183
+ "grad_norm": 3.2633206844329834,
184
+ "learning_rate": 8.308429187984297e-05,
185
+ "loss": 0.1986,
186
+ "step": 240
187
+ },
188
+ {
189
+ "epoch": 1.4214641080312722,
190
+ "grad_norm": 2.2755699157714844,
191
+ "learning_rate": 8.117449009293668e-05,
192
+ "loss": 0.209,
193
+ "step": 250
194
+ },
195
+ {
196
+ "epoch": 1.4783226723525231,
197
+ "grad_norm": 1.3403345346450806,
198
+ "learning_rate": 7.91871836117395e-05,
199
+ "loss": 0.191,
200
+ "step": 260
201
+ },
202
+ {
203
+ "epoch": 1.535181236673774,
204
+ "grad_norm": 2.5068321228027344,
205
+ "learning_rate": 7.712731319328798e-05,
206
+ "loss": 0.2046,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 1.5920398009950247,
211
+ "grad_norm": 1.23170804977417,
212
+ "learning_rate": 7.500000000000001e-05,
213
+ "loss": 0.1939,
214
+ "step": 280
215
+ },
216
+ {
217
+ "epoch": 1.6488983653162759,
218
+ "grad_norm": 1.3532381057739258,
219
+ "learning_rate": 7.281053286765815e-05,
220
+ "loss": 0.2072,
221
+ "step": 290
222
+ },
223
+ {
224
+ "epoch": 1.7057569296375266,
225
+ "grad_norm": 1.6122020483016968,
226
+ "learning_rate": 7.056435515653059e-05,
227
+ "loss": 0.1934,
228
+ "step": 300
229
+ },
230
+ {
231
+ "epoch": 1.7626154939587777,
232
+ "grad_norm": 1.4875357151031494,
233
+ "learning_rate": 6.826705121831976e-05,
234
+ "loss": 0.1782,
235
+ "step": 310
236
+ },
237
+ {
238
+ "epoch": 1.8194740582800284,
239
+ "grad_norm": 1.375095248222351,
240
+ "learning_rate": 6.592433251258423e-05,
241
+ "loss": 0.1879,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 1.8763326226012793,
246
+ "grad_norm": 1.3681703805923462,
247
+ "learning_rate": 6.354202340715026e-05,
248
+ "loss": 0.1862,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 1.9331911869225302,
253
+ "grad_norm": 1.6180658340454102,
254
+ "learning_rate": 6.112604669781572e-05,
255
+ "loss": 0.2014,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.9900497512437811,
260
+ "grad_norm": 1.4774919748306274,
261
+ "learning_rate": 5.868240888334653e-05,
262
+ "loss": 0.1917,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.9900497512437811,
267
+ "eval_accuracy": 0.9307333333333332,
268
+ "eval_loss": 0.18561618030071259,
269
+ "eval_runtime": 136.2276,
270
+ "eval_samples_per_second": 18.352,
271
+ "eval_steps_per_second": 18.352,
272
+ "step": 350
273
+ }
274
+ ],
275
+ "logging_steps": 10,
276
+ "max_steps": 700,
277
+ "num_input_tokens_seen": 0,
278
+ "num_train_epochs": 4,
279
+ "save_steps": 175,
280
+ "stateful_callbacks": {
281
+ "TrainerControl": {
282
+ "args": {
283
+ "should_epoch_stop": false,
284
+ "should_evaluate": false,
285
+ "should_log": false,
286
+ "should_save": true,
287
+ "should_training_stop": false
288
+ },
289
+ "attributes": {}
290
+ }
291
+ },
292
+ "total_flos": 8.714468225636106e+17,
293
+ "train_batch_size": 16,
294
+ "trial_name": null,
295
+ "trial_params": null
296
+ }
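`log_history` above interleaves training entries (logged every 10 steps, keyed by `loss`) with evaluation entries (every 175 steps, keyed by `eval_loss`). A small sketch for separating the two series from a local copy of the file (path assumed):

```python
# Sketch: separate training-loss and eval entries from trainer_state.json's log_history.
import json

with open("trainer_state.json") as f:  # hypothetical local copy of the file above
    state = json.load(f)

train_points = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_points = [(e["step"], e["eval_loss"], e["eval_accuracy"])
               for e in state["log_history"] if "eval_loss" in e]

print(train_points[-1])  # (350, 0.1917) for this checkpoint
print(eval_points[-1])   # (350, 0.18561618030071259, 0.9307333333333332) for this checkpoint
```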
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-350/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a9e6f3241dabbb8a63b52024fb0fb0d68c85f4b48b07e8579ebd8f41fe5fd662
+ size 5304
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ library_name: peft
3
+ base_model: THUDM/glm-4-9b-chat-1m
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/adapter_config.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "THUDM/glm-4-9b-chat-1m",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "dense_h_to_4h",
24
+ "query_key_value",
25
+ "dense_4h_to_h",
26
+ "dense"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
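The adapter_config.json above describes a LoRA adapter (r=8, lora_alpha=16, targeting the query_key_value, dense, dense_h_to_4h and dense_4h_to_h projections) on top of THUDM/glm-4-9b-chat-1m. As a minimal sketch, assuming the checkpoint directory path from this commit and a bfloat16, device_map="auto" load, such an adapter could be applied with transformers and peft like so:

```python
# Minimal sketch: attach the LoRA adapter saved in this checkpoint to its base model.
# The path, dtype and device placement below are assumptions, not part of the commit.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

BASE = "THUDM/glm-4-9b-chat-1m"
ADAPTER = "llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525"

tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    BASE, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True
)
model = PeftModel.from_pretrained(model, ADAPTER)  # injects the r=8, alpha=16 LoRA weights
model.eval()
```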
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8950e9830e197a7ba9feb5d7846eab5774492a29cc003e12509fb0a46fda573
3
+ size 85409560
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/added_tokens.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "<eop>": 151334,
3
+ "<sop>": 151333,
4
+ "<|assistant|>": 151337,
5
+ "<|begin_of_image|>": 151339,
6
+ "<|begin_of_video|>": 151341,
7
+ "<|end_of_image|>": 151340,
8
+ "<|end_of_video|>": 151342,
9
+ "<|endoftext|>": 151329,
10
+ "<|observation|>": 151338,
11
+ "<|system|>": 151335,
12
+ "<|user|>": 151336,
13
+ "[MASK]": 151330,
14
+ "[gMASK]": 151331,
15
+ "[sMASK]": 151332
16
+ }
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/optimizer.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:492bcdefdfbfd8576f444c520cde808f197cf0229e536c2d5834485e478baf8a
3
+ size 170990330
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/rng_state.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c062f7f375beded48b5337f5a3f3a5cb38807fa3e85dbf3e294c0ab6b627bfc2
3
+ size 14244
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eed90e588b5c04a63000dc8b8376b1e11f37980f3dd5d73e4c2a4b71a995cf3a
3
+ size 1064
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|endoftext|>",
4
+ "[MASK]",
5
+ "[gMASK]",
6
+ "[sMASK]",
7
+ "<sop>",
8
+ "<eop>",
9
+ "<|system|>",
10
+ "<|user|>",
11
+ "<|assistant|>",
12
+ "<|observation|>",
13
+ "<|begin_of_image|>",
14
+ "<|end_of_image|>",
15
+ "<|begin_of_video|>",
16
+ "<|end_of_video|>"
17
+ ],
18
+ "eos_token": {
19
+ "content": "<|endoftext|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<|endoftext|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ }
32
+ }
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/tokenization_chatglm.py ADDED
@@ -0,0 +1,323 @@
1
+ import regex as re
2
+ import base64
3
+ import os
4
+ import json
5
+ import tiktoken
6
+ from torch import TensorType
7
+ from typing import List, Optional, Union, Dict, Any
8
+ from transformers import PreTrainedTokenizer
9
+ from transformers.utils import logging, PaddingStrategy
10
+ from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
11
+
12
+
13
+ class ChatGLM4Tokenizer(PreTrainedTokenizer):
14
+ vocab_files_names = {"vocab_file": "tokenizer.model"}
15
+ model_input_names = ["input_ids", "attention_mask", "position_ids"]
16
+
17
+ def __init__(
18
+ self,
19
+ vocab_file,
20
+ padding_side="left",
21
+ clean_up_tokenization_spaces=False,
22
+ encode_special_tokens=False,
23
+ **kwargs
24
+ ):
25
+ self.name = "GLM4Tokenizer"
26
+ self.vocab_file = vocab_file
27
+ pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
28
+ self.pat_str = re.compile(pat_str)
29
+ self.encode_special_tokens = encode_special_tokens
30
+
31
+ mergeable_ranks = {}
32
+ with open(vocab_file) as f:
33
+ for line in f:
34
+ token, rank = line.strip().split()
35
+ rank = int(rank)
36
+ token = base64.b64decode(token)
37
+ mergeable_ranks[token] = rank
38
+
39
+ self.mergeable_ranks = mergeable_ranks
40
+
41
+ self.tokenizer = tiktoken.Encoding(
42
+ name="my_tokenizer",
43
+ pat_str=pat_str,
44
+ mergeable_ranks=mergeable_ranks,
45
+ special_tokens={}
46
+ )
47
+ self.decoder = {rank: token for token, rank in mergeable_ranks.items()}
48
+ self.n_words = len(self.decoder)
49
+
50
+ super().__init__(
51
+ padding_side=padding_side,
52
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
53
+ **kwargs
54
+ )
55
+
56
+ @property
57
+ def vocab_size(self):
58
+ return self.n_words
59
+
60
+ def get_vocab(self):
61
+ """ Returns vocab as a dict """
62
+ vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
63
+ vocab.update(self.added_tokens_encoder)
64
+ return vocab
65
+
66
+ def convert_tokens_to_string(self, tokens: List[Union[bytes, str, int]]) -> str:
67
+ """
68
+ Converts a sequence of tokens into a single string.
69
+ """
70
+ text = ""
71
+ temp = b""
72
+ for t in tokens:
73
+ if isinstance(t, int):
74
+ t = chr(t)
75
+ if isinstance(t, str):
76
+ if temp:
77
+ text += temp.decode("utf-8", errors="replace")
78
+ elif isinstance(t, bytes):
79
+ temp += t
80
+ else:
81
+ raise TypeError("token should only be of type int, bytes or str")
82
+ if temp:
83
+ text += temp.decode("utf-8", errors="replace")
84
+ return text
85
+
86
+ def _tokenize(self, text, **kwargs):
87
+ tokens = []
88
+ ids = self.tokenizer.encode(text)
89
+ for t in ids:
90
+ tokens.append(self.decoder[t])
91
+ return tokens
92
+
93
+ def _convert_token_to_id(self, token):
94
+ """ Converts a token (str) into an id using the vocab. """
95
+ return self.mergeable_ranks[token]
96
+
97
+ def _convert_id_to_token(self, index):
98
+ """Converts an index (integer) into a token (str) using the vocab."""
99
+ return self.decoder.get(index, "")
100
+
101
+ def save_vocabulary(self, save_directory, filename_prefix=None):
102
+ """
103
+ Save the vocabulary and special tokens file to a directory.
104
+
105
+ Args:
106
+ save_directory (`str`):
107
+ The directory in which to save the vocabulary.
108
+ filename_prefix (`str`, *optional*):
109
+ An optional prefix to add to the names of the saved files.
110
+
111
+ Returns:
112
+ `Tuple(str)`: Paths to the files saved.
113
+ """
114
+ if os.path.isdir(save_directory):
115
+ vocab_file = os.path.join(
116
+ save_directory, self.vocab_files_names["vocab_file"]
117
+ )
118
+ else:
119
+ vocab_file = save_directory
120
+
121
+ with open(self.vocab_file, 'rb') as fin:
122
+ proto_str = fin.read()
123
+
124
+ with open(vocab_file, "wb") as writer:
125
+ writer.write(proto_str)
126
+
127
+ return (vocab_file,)
128
+
129
+ def get_prefix_tokens(self):
130
+ prefix_tokens = [self.convert_tokens_to_ids("[gMASK]"), self.convert_tokens_to_ids("<sop>")]
131
+ return prefix_tokens
132
+
133
+ def build_single_message(self, role, metadata, message, tokenize=True):
134
+ assert role in ["system", "user", "assistant", "observation"], role
135
+ if tokenize:
136
+ role_tokens = [self.convert_tokens_to_ids(f"<|{role}|>")] + self.tokenizer.encode(f"{metadata}\n",
137
+ disallowed_special=())
138
+ message_tokens = self.tokenizer.encode(message, disallowed_special=())
139
+ tokens = role_tokens + message_tokens
140
+ return tokens
141
+ else:
142
+ return str(f"<|{role}|>{metadata}\n{message}")
143
+
144
+ # Use Jinja Template in tokenizer_config.json
145
+ # def apply_chat_template(
146
+ # self,
147
+ # conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], "Conversation"],
148
+ # add_generation_prompt: bool = False,
149
+ # tokenize: bool = True,
150
+ # padding: bool = False,
151
+ # truncation: bool = False,
152
+ # max_length: Optional[int] = None,
153
+ # return_tensors: Optional[Union[str, TensorType]] = None,
154
+ # return_dict: bool = False,
155
+ # tokenizer_kwargs: Optional[Dict[str, Any]] = None,
156
+ # add_special_tokens: bool = True,
157
+ # **kwargs,
158
+ # ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
159
+ #
160
+ # if return_dict and not tokenize:
161
+ # raise ValueError(
162
+ # "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
163
+ # "of tokenizer outputs to return."
164
+ # )
165
+ #
166
+ # def handle_single_conversation(conversation):
167
+ # input_ids = self.get_prefix_tokens() if add_special_tokens else []
168
+ # input_message = "[gMASK]<sop>" if add_special_tokens else ""
169
+ # for item in conversation:
170
+ # if item.get("tools"):
171
+ # tools = item["tools"]
172
+ # content = "你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。"
173
+ # content += "\n\n# 可用工具"
174
+ # for tool in tools:
175
+ # if tool["type"] == "function":
176
+ # function = tool["function"]
177
+ # content += f"\n\n## {function['name']}\n\n{json.dumps(function, ensure_ascii=False, indent=4)}"
178
+ # content += "\n在调用上述函数时,请使用 Json 格式表示调用的参数。"
179
+ # elif tool["type"] == "python":
180
+ # content += "\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。"
181
+ # elif tool["type"] == "simple_browser":
182
+ # content += "\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。"
183
+ # elif tool["type"] == "cogview":
184
+ # content += "\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。"
185
+ # else:
186
+ # raise NotImplementedError(f"Unknown tool type {tool['type']}")
187
+ # input = self.build_single_message("system", "", content, tokenize=tokenize)
188
+ # if tokenize:
189
+ # input_ids.extend(input)
190
+ # else:
191
+ # input_message += input
192
+ # if item["content"]:
193
+ # input = self.build_single_message(
194
+ # item["role"],
195
+ # item.get("metadata", ""),
196
+ # item["content"],
197
+ # tokenize=tokenize
198
+ # )
199
+ # if tokenize:
200
+ # input_ids.extend(input)
201
+ # else:
202
+ # input_message += input
203
+ # if add_generation_prompt:
204
+ # if tokenize:
205
+ # input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
206
+ # else:
207
+ # input_message += "<|assistant|>"
208
+ # return input_ids if tokenize else input_message
209
+ #
210
+ # # Main logic to handle different conversation formats
211
+ # if isinstance(conversation, list) and all(isinstance(i, dict) for i in conversation):
212
+ # result = handle_single_conversation(conversation)
213
+ # elif isinstance(conversation, list) and all(isinstance(i, list) for i in conversation):
214
+ # result = [handle_single_conversation(c) for c in conversation]
215
+ # elif hasattr(conversation, "messages"):
216
+ # result = handle_single_conversation(conversation.messages)
217
+ # else:
218
+ # raise ValueError("Invalid conversation format")
219
+ #
220
+ # if tokenize:
221
+ # output = self.batch_encode_plus(
222
+ # [result] if isinstance(result[0], int) else result,
223
+ # padding=padding,
224
+ # truncation=truncation,
225
+ # max_length=max_length,
226
+ # return_tensors=return_tensors,
227
+ # is_split_into_words=True,
228
+ # add_special_tokens=False
229
+ # )
230
+ # if return_dict:
231
+ # return output
232
+ # else:
233
+ # return output["input_ids"]
234
+ # else:
235
+ # return result
236
+
237
+ def build_inputs_with_special_tokens(
238
+ self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
239
+ ) -> List[int]:
240
+ """
241
+ Build model inputs from a sequence or a pair of sequences by concatenating and
242
+ adding special tokens. A GLM-4 sequence has the following format:
243
+
244
+ - single sequence: `[gMASK]<sop> X`
245
+ - pair of sequences: `[gMASK]<sop> A B <eos>`
246
+
247
+ Args:
248
+ token_ids_0 (`List[int]`):
249
+ List of IDs to which the special tokens will be added.
250
+ token_ids_1 (`List[int]`, *optional*):
251
+ Optional second list of IDs for sequence pairs.
252
+
253
+ Returns:
254
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
255
+ """
256
+ prefix_tokens = self.get_prefix_tokens()
257
+ token_ids_0 = prefix_tokens + token_ids_0
258
+ if token_ids_1 is not None:
259
+ token_ids_0 = token_ids_0 + token_ids_1 + [self.convert_tokens_to_ids("<eos>")]
260
+ return token_ids_0
261
+
262
+ def _pad(
263
+ self,
264
+ encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
265
+ max_length: Optional[int] = None,
266
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
267
+ pad_to_multiple_of: Optional[int] = None,
268
+ return_attention_mask: Optional[bool] = None,
269
+ ) -> dict:
270
+ """
271
+ Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
272
+
273
+ Args:
274
+ encoded_inputs:
275
+ Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
276
+ max_length: maximum length of the returned list and optionally padding length (see below).
277
+ Will truncate by taking into account the special tokens.
278
+ padding_strategy: PaddingStrategy to use for padding.
279
+
280
+ - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
281
+ - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
282
+ - PaddingStrategy.DO_NOT_PAD: Do not pad
283
+ The tokenizer padding sides are defined in self.padding_side:
284
+
285
+ - 'left': pads on the left of the sequences
286
+ - 'right': pads on the right of the sequences
287
+ pad_to_multiple_of: (optional) Integer; if set, will pad the sequence to a multiple of the provided value.
288
+ This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
289
+ `>= 7.5` (Volta).
290
+ return_attention_mask:
291
+ (optional) Set to False to avoid returning attention mask (default: set to model specifics)
292
+ """
293
+ # Load from model defaults
294
+ assert self.padding_side == "left"
295
+
296
+ required_input = encoded_inputs[self.model_input_names[0]]
297
+ seq_length = len(required_input)
298
+
299
+ if padding_strategy == PaddingStrategy.LONGEST:
300
+ max_length = len(required_input)
301
+
302
+ if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
303
+ max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
304
+
305
+ needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
306
+
307
+ # Initialize attention mask if not present.
308
+ if "attention_mask" not in encoded_inputs:
309
+ encoded_inputs["attention_mask"] = [1] * seq_length
310
+
311
+ if "position_ids" not in encoded_inputs:
312
+ encoded_inputs["position_ids"] = list(range(seq_length))
313
+
314
+ if needs_to_be_padded:
315
+ difference = max_length - len(required_input)
316
+
317
+ if "attention_mask" in encoded_inputs:
318
+ encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
319
+ if "position_ids" in encoded_inputs:
320
+ encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
321
+ encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
322
+
323
+ return encoded_inputs
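The ChatGLM4Tokenizer above is the custom tokenizer class this checkpoint ships (tokenizer_config.json wires it up through auto_map). As a rough usage sketch, assuming it is loaded with trust_remote_code, its prefix and message helpers can be combined into a prompt like so; the sample text is made up:

```python
# Sketch of using the custom tokenizer's helpers defined above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat-1m", trust_remote_code=True)

ids = tok.get_prefix_tokens()                               # [gMASK], <sop>
ids += tok.build_single_message("user", "", "Hello there")  # <|user|>, metadata, message
ids += [tok.convert_tokens_to_ids("<|assistant|>")]         # open the assistant turn
print(tok.decode(ids))
```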
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/tokenizer.model ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5a493598071550244b2ee7f26118f3edec2150b9dfa967929a99052ac83fe716
3
+ size 2623634
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/tokenizer_config.json ADDED
@@ -0,0 +1,148 @@
1
+ {
2
+ "added_tokens_decoder": {
3
+ "151329": {
4
+ "content": "<|endoftext|>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "151330": {
12
+ "content": "[MASK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "151331": {
20
+ "content": "[gMASK]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "151332": {
28
+ "content": "[sMASK]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "151333": {
36
+ "content": "<sop>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "151334": {
44
+ "content": "<eop>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "151335": {
52
+ "content": "<|system|>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ },
59
+ "151336": {
60
+ "content": "<|user|>",
61
+ "lstrip": false,
62
+ "normalized": false,
63
+ "rstrip": false,
64
+ "single_word": false,
65
+ "special": true
66
+ },
67
+ "151337": {
68
+ "content": "<|assistant|>",
69
+ "lstrip": false,
70
+ "normalized": false,
71
+ "rstrip": false,
72
+ "single_word": false,
73
+ "special": true
74
+ },
75
+ "151338": {
76
+ "content": "<|observation|>",
77
+ "lstrip": false,
78
+ "normalized": false,
79
+ "rstrip": false,
80
+ "single_word": false,
81
+ "special": true
82
+ },
83
+ "151339": {
84
+ "content": "<|begin_of_image|>",
85
+ "lstrip": false,
86
+ "normalized": false,
87
+ "rstrip": false,
88
+ "single_word": false,
89
+ "special": true
90
+ },
91
+ "151340": {
92
+ "content": "<|end_of_image|>",
93
+ "lstrip": false,
94
+ "normalized": false,
95
+ "rstrip": false,
96
+ "single_word": false,
97
+ "special": true
98
+ },
99
+ "151341": {
100
+ "content": "<|begin_of_video|>",
101
+ "lstrip": false,
102
+ "normalized": false,
103
+ "rstrip": false,
104
+ "single_word": false,
105
+ "special": true
106
+ },
107
+ "151342": {
108
+ "content": "<|end_of_video|>",
109
+ "lstrip": false,
110
+ "normalized": false,
111
+ "rstrip": false,
112
+ "single_word": false,
113
+ "special": true
114
+ }
115
+ },
116
+ "additional_special_tokens": [
117
+ "<|endoftext|>",
118
+ "[MASK]",
119
+ "[gMASK]",
120
+ "[sMASK]",
121
+ "<sop>",
122
+ "<eop>",
123
+ "<|system|>",
124
+ "<|user|>",
125
+ "<|assistant|>",
126
+ "<|observation|>",
127
+ "<|begin_of_image|>",
128
+ "<|end_of_image|>",
129
+ "<|begin_of_video|>",
130
+ "<|end_of_video|>"
131
+ ],
132
+ "auto_map": {
133
+ "AutoTokenizer": [
134
+ "tokenization_chatglm.ChatGLM4Tokenizer",
135
+ null
136
+ ]
137
+ },
138
+ "chat_template": "{{ '[gMASK]<sop>' }}{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{% endif %}{% if system_message is defined %}{{ '<|system|>\n' + system_message }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|user|>\n' + content + '<|assistant|>' }}{% elif message['role'] == 'assistant' %}{{ '\n' + content }}{% endif %}{% endfor %}",
139
+ "clean_up_tokenization_spaces": false,
140
+ "do_lower_case": false,
141
+ "eos_token": "<|endoftext|>",
142
+ "model_max_length": 1024000,
143
+ "pad_token": "<|endoftext|>",
144
+ "padding_side": "right",
145
+ "remove_space": false,
146
+ "split_special_tokens": false,
147
+ "tokenizer_class": "ChatGLM4Tokenizer"
148
+ }
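The chat_template above is a Jinja template that prefixes every conversation with [gMASK]<sop>, renders an optional <|system|> turn, and appends <|assistant|> after each user message. A short sketch of applying it through the tokenizer (the messages are illustrative):

```python
# Sketch: render a conversation with the Jinja chat_template defined above.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat-1m", trust_remote_code=True)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Summarise LoRA in one sentence."},
]
# The template itself appends <|assistant|> after the last user turn.
prompt = tok.apply_chat_template(messages, tokenize=False)
print(prompt)
```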
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/trainer_state.json ADDED
@@ -0,0 +1,424 @@
1
+ {
2
+ "best_metric": null,
3
+ "best_model_checkpoint": null,
4
+ "epoch": 2.9850746268656714,
5
+ "eval_steps": 175,
6
+ "global_step": 525,
7
+ "is_hyper_param_search": false,
8
+ "is_local_process_zero": true,
9
+ "is_world_process_zero": true,
10
+ "log_history": [
11
+ {
12
+ "epoch": 0.05685856432125089,
13
+ "grad_norm": 2.919694662094116,
14
+ "learning_rate": 1.4285714285714285e-05,
15
+ "loss": 3.8009,
16
+ "step": 10
17
+ },
18
+ {
19
+ "epoch": 0.11371712864250177,
20
+ "grad_norm": 3.130059003829956,
21
+ "learning_rate": 2.857142857142857e-05,
22
+ "loss": 0.3289,
23
+ "step": 20
24
+ },
25
+ {
26
+ "epoch": 0.17057569296375266,
27
+ "grad_norm": 1.6621949672698975,
28
+ "learning_rate": 4.2857142857142856e-05,
29
+ "loss": 0.2598,
30
+ "step": 30
31
+ },
32
+ {
33
+ "epoch": 0.22743425728500355,
34
+ "grad_norm": 2.371370792388916,
35
+ "learning_rate": 5.714285714285714e-05,
36
+ "loss": 0.2401,
37
+ "step": 40
38
+ },
39
+ {
40
+ "epoch": 0.28429282160625446,
41
+ "grad_norm": 0.8625539541244507,
42
+ "learning_rate": 7.142857142857143e-05,
43
+ "loss": 0.2306,
44
+ "step": 50
45
+ },
46
+ {
47
+ "epoch": 0.3411513859275053,
48
+ "grad_norm": 5.281027793884277,
49
+ "learning_rate": 8.571428571428571e-05,
50
+ "loss": 0.2463,
51
+ "step": 60
52
+ },
53
+ {
54
+ "epoch": 0.39800995024875624,
55
+ "grad_norm": 1.4231479167938232,
56
+ "learning_rate": 0.0001,
57
+ "loss": 0.2291,
58
+ "step": 70
59
+ },
60
+ {
61
+ "epoch": 0.4548685145700071,
62
+ "grad_norm": 1.9341018199920654,
63
+ "learning_rate": 9.993784606094612e-05,
64
+ "loss": 0.2301,
65
+ "step": 80
66
+ },
67
+ {
68
+ "epoch": 0.511727078891258,
69
+ "grad_norm": 2.0011138916015625,
70
+ "learning_rate": 9.975153876827008e-05,
71
+ "loss": 0.2207,
72
+ "step": 90
73
+ },
74
+ {
75
+ "epoch": 0.5685856432125089,
76
+ "grad_norm": 0.6652698516845703,
77
+ "learning_rate": 9.944154131125642e-05,
78
+ "loss": 0.2271,
79
+ "step": 100
80
+ },
81
+ {
82
+ "epoch": 0.6254442075337597,
83
+ "grad_norm": 1.3393596410751343,
84
+ "learning_rate": 9.900862439242719e-05,
85
+ "loss": 0.216,
86
+ "step": 110
87
+ },
88
+ {
89
+ "epoch": 0.6823027718550106,
90
+ "grad_norm": 5.120615482330322,
91
+ "learning_rate": 9.84538643114539e-05,
92
+ "loss": 0.2311,
93
+ "step": 120
94
+ },
95
+ {
96
+ "epoch": 0.7391613361762616,
97
+ "grad_norm": 1.72005033493042,
98
+ "learning_rate": 9.777864028930705e-05,
99
+ "loss": 0.212,
100
+ "step": 130
101
+ },
102
+ {
103
+ "epoch": 0.7960199004975125,
104
+ "grad_norm": 1.5774250030517578,
105
+ "learning_rate": 9.698463103929542e-05,
106
+ "loss": 0.2261,
107
+ "step": 140
108
+ },
109
+ {
110
+ "epoch": 0.8528784648187633,
111
+ "grad_norm": 93.82937622070312,
112
+ "learning_rate": 9.607381059352038e-05,
113
+ "loss": 2.368,
114
+ "step": 150
115
+ },
116
+ {
117
+ "epoch": 0.9097370291400142,
118
+ "grad_norm": 11.905146598815918,
119
+ "learning_rate": 9.504844339512095e-05,
120
+ "loss": 2.4337,
121
+ "step": 160
122
+ },
123
+ {
124
+ "epoch": 0.9665955934612651,
125
+ "grad_norm": 4.584591388702393,
126
+ "learning_rate": 9.391107866851143e-05,
127
+ "loss": 0.8958,
128
+ "step": 170
129
+ },
130
+ {
131
+ "epoch": 0.9950248756218906,
132
+ "eval_accuracy": 0.7612800000000001,
133
+ "eval_loss": 0.4472738206386566,
134
+ "eval_runtime": 136.3095,
135
+ "eval_samples_per_second": 18.341,
136
+ "eval_steps_per_second": 18.341,
137
+ "step": 175
138
+ },
139
+ {
140
+ "epoch": 1.023454157782516,
141
+ "grad_norm": 4.700326919555664,
142
+ "learning_rate": 9.266454408160779e-05,
143
+ "loss": 0.4271,
144
+ "step": 180
145
+ },
146
+ {
147
+ "epoch": 1.080312722103767,
148
+ "grad_norm": 1.9691985845565796,
149
+ "learning_rate": 9.131193871579975e-05,
150
+ "loss": 0.2324,
151
+ "step": 190
152
+ },
153
+ {
154
+ "epoch": 1.1371712864250179,
155
+ "grad_norm": 2.0088887214660645,
156
+ "learning_rate": 8.985662536114613e-05,
157
+ "loss": 0.1996,
158
+ "step": 200
159
+ },
160
+ {
161
+ "epoch": 1.1940298507462686,
162
+ "grad_norm": 6.571439266204834,
163
+ "learning_rate": 8.83022221559489e-05,
164
+ "loss": 0.1961,
165
+ "step": 210
166
+ },
167
+ {
168
+ "epoch": 1.2508884150675195,
169
+ "grad_norm": 3.290731191635132,
170
+ "learning_rate": 8.665259359149132e-05,
171
+ "loss": 0.1942,
172
+ "step": 220
173
+ },
174
+ {
175
+ "epoch": 1.3077469793887704,
176
+ "grad_norm": 1.540110468864441,
177
+ "learning_rate": 8.491184090430364e-05,
178
+ "loss": 0.2152,
179
+ "step": 230
180
+ },
181
+ {
182
+ "epoch": 1.3646055437100213,
183
+ "grad_norm": 3.2633206844329834,
184
+ "learning_rate": 8.308429187984297e-05,
185
+ "loss": 0.1986,
186
+ "step": 240
187
+ },
188
+ {
189
+ "epoch": 1.4214641080312722,
190
+ "grad_norm": 2.2755699157714844,
191
+ "learning_rate": 8.117449009293668e-05,
192
+ "loss": 0.209,
193
+ "step": 250
194
+ },
195
+ {
196
+ "epoch": 1.4783226723525231,
197
+ "grad_norm": 1.3403345346450806,
198
+ "learning_rate": 7.91871836117395e-05,
199
+ "loss": 0.191,
200
+ "step": 260
201
+ },
202
+ {
203
+ "epoch": 1.535181236673774,
204
+ "grad_norm": 2.5068321228027344,
205
+ "learning_rate": 7.712731319328798e-05,
206
+ "loss": 0.2046,
207
+ "step": 270
208
+ },
209
+ {
210
+ "epoch": 1.5920398009950247,
211
+ "grad_norm": 1.23170804977417,
212
+ "learning_rate": 7.500000000000001e-05,
213
+ "loss": 0.1939,
214
+ "step": 280
215
+ },
216
+ {
217
+ "epoch": 1.6488983653162759,
218
+ "grad_norm": 1.3532381057739258,
219
+ "learning_rate": 7.281053286765815e-05,
220
+ "loss": 0.2072,
221
+ "step": 290
222
+ },
223
+ {
224
+ "epoch": 1.7057569296375266,
225
+ "grad_norm": 1.6122020483016968,
226
+ "learning_rate": 7.056435515653059e-05,
227
+ "loss": 0.1934,
228
+ "step": 300
229
+ },
230
+ {
231
+ "epoch": 1.7626154939587777,
232
+ "grad_norm": 1.4875357151031494,
233
+ "learning_rate": 6.826705121831976e-05,
234
+ "loss": 0.1782,
235
+ "step": 310
236
+ },
237
+ {
238
+ "epoch": 1.8194740582800284,
239
+ "grad_norm": 1.375095248222351,
240
+ "learning_rate": 6.592433251258423e-05,
241
+ "loss": 0.1879,
242
+ "step": 320
243
+ },
244
+ {
245
+ "epoch": 1.8763326226012793,
246
+ "grad_norm": 1.3681703805923462,
247
+ "learning_rate": 6.354202340715026e-05,
248
+ "loss": 0.1862,
249
+ "step": 330
250
+ },
251
+ {
252
+ "epoch": 1.9331911869225302,
253
+ "grad_norm": 1.6180658340454102,
254
+ "learning_rate": 6.112604669781572e-05,
255
+ "loss": 0.2014,
256
+ "step": 340
257
+ },
258
+ {
259
+ "epoch": 1.9900497512437811,
260
+ "grad_norm": 1.4774919748306274,
261
+ "learning_rate": 5.868240888334653e-05,
262
+ "loss": 0.1917,
263
+ "step": 350
264
+ },
265
+ {
266
+ "epoch": 1.9900497512437811,
267
+ "eval_accuracy": 0.9307333333333332,
268
+ "eval_loss": 0.18561618030071259,
269
+ "eval_runtime": 136.2276,
270
+ "eval_samples_per_second": 18.352,
271
+ "eval_steps_per_second": 18.352,
272
+ "step": 350
273
+ },
274
+ {
275
+ "epoch": 2.046908315565032,
276
+ "grad_norm": 1.195068597793579,
277
+ "learning_rate": 5.621718523237427e-05,
278
+ "loss": 0.1637,
279
+ "step": 360
280
+ },
281
+ {
282
+ "epoch": 2.1037668798862827,
283
+ "grad_norm": 1.4060101509094238,
284
+ "learning_rate": 5.373650467932122e-05,
285
+ "loss": 0.1464,
286
+ "step": 370
287
+ },
288
+ {
289
+ "epoch": 2.160625444207534,
290
+ "grad_norm": 1.7987910509109497,
291
+ "learning_rate": 5.124653458690365e-05,
292
+ "loss": 0.1375,
293
+ "step": 380
294
+ },
295
+ {
296
+ "epoch": 2.2174840085287846,
297
+ "grad_norm": 1.1465859413146973,
298
+ "learning_rate": 4.875346541309637e-05,
299
+ "loss": 0.1509,
300
+ "step": 390
301
+ },
302
+ {
303
+ "epoch": 2.2743425728500357,
304
+ "grad_norm": 1.3375942707061768,
305
+ "learning_rate": 4.626349532067879e-05,
306
+ "loss": 0.1388,
307
+ "step": 400
308
+ },
309
+ {
310
+ "epoch": 2.3312011371712864,
311
+ "grad_norm": 1.257914662361145,
312
+ "learning_rate": 4.378281476762576e-05,
313
+ "loss": 0.1493,
314
+ "step": 410
315
+ },
316
+ {
317
+ "epoch": 2.388059701492537,
318
+ "grad_norm": 0.9045670628547668,
319
+ "learning_rate": 4.131759111665349e-05,
320
+ "loss": 0.1602,
321
+ "step": 420
322
+ },
323
+ {
324
+ "epoch": 2.4449182658137882,
325
+ "grad_norm": 1.2219940423965454,
326
+ "learning_rate": 3.887395330218429e-05,
327
+ "loss": 0.1636,
328
+ "step": 430
329
+ },
330
+ {
331
+ "epoch": 2.501776830135039,
332
+ "grad_norm": 1.0463968515396118,
333
+ "learning_rate": 3.6457976592849754e-05,
334
+ "loss": 0.1428,
335
+ "step": 440
336
+ },
337
+ {
338
+ "epoch": 2.55863539445629,
339
+ "grad_norm": 1.3076916933059692,
340
+ "learning_rate": 3.4075667487415785e-05,
341
+ "loss": 0.1569,
342
+ "step": 450
343
+ },
344
+ {
345
+ "epoch": 2.6154939587775408,
346
+ "grad_norm": 0.6994168758392334,
347
+ "learning_rate": 3.173294878168025e-05,
348
+ "loss": 0.151,
349
+ "step": 460
350
+ },
351
+ {
352
+ "epoch": 2.672352523098792,
353
+ "grad_norm": 1.2010096311569214,
354
+ "learning_rate": 2.9435644843469436e-05,
355
+ "loss": 0.1429,
356
+ "step": 470
357
+ },
358
+ {
359
+ "epoch": 2.7292110874200426,
360
+ "grad_norm": 0.9571990370750427,
361
+ "learning_rate": 2.718946713234185e-05,
362
+ "loss": 0.1446,
363
+ "step": 480
364
+ },
365
+ {
366
+ "epoch": 2.7860696517412933,
367
+ "grad_norm": 0.9076853394508362,
368
+ "learning_rate": 2.500000000000001e-05,
369
+ "loss": 0.1387,
370
+ "step": 490
371
+ },
372
+ {
373
+ "epoch": 2.8429282160625444,
374
+ "grad_norm": 1.4032243490219116,
375
+ "learning_rate": 2.2872686806712035e-05,
376
+ "loss": 0.1593,
377
+ "step": 500
378
+ },
379
+ {
380
+ "epoch": 2.8997867803837956,
381
+ "grad_norm": 0.90634685754776,
382
+ "learning_rate": 2.0812816388260518e-05,
383
+ "loss": 0.1376,
384
+ "step": 510
385
+ },
386
+ {
387
+ "epoch": 2.9566453447050463,
388
+ "grad_norm": 1.0794312953948975,
389
+ "learning_rate": 1.8825509907063327e-05,
390
+ "loss": 0.1287,
391
+ "step": 520
392
+ },
393
+ {
394
+ "epoch": 2.9850746268656714,
395
+ "eval_accuracy": 0.9336866666666667,
396
+ "eval_loss": 0.1812731772661209,
397
+ "eval_runtime": 136.2237,
398
+ "eval_samples_per_second": 18.352,
399
+ "eval_steps_per_second": 18.352,
400
+ "step": 525
401
+ }
402
+ ],
403
+ "logging_steps": 10,
404
+ "max_steps": 700,
405
+ "num_input_tokens_seen": 0,
406
+ "num_train_epochs": 4,
407
+ "save_steps": 175,
408
+ "stateful_callbacks": {
409
+ "TrainerControl": {
410
+ "args": {
411
+ "should_epoch_stop": false,
412
+ "should_evaluate": false,
413
+ "should_log": false,
414
+ "should_save": true,
415
+ "should_training_stop": false
416
+ },
417
+ "attributes": {}
418
+ }
419
+ },
420
+ "total_flos": 1.3072222611324273e+18,
421
+ "train_batch_size": 16,
422
+ "trial_name": null,
423
+ "trial_params": null
424
+ }
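trainer_state.json above logs the training loss every 10 steps and an evaluation pass every 175 steps, with eval_accuracy rising from roughly 0.761 at step 175 to 0.934 at step 525. As a small sketch, assuming the checkpoint path from this commit, those curves can be read back out of the file:

```python
# Sketch: pull the logged training-loss and eval curves out of trainer_state.json.
import json

path = "llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/trainer_state.json"
with open(path) as f:
    state = json.load(f)

train_loss = [(e["step"], e["loss"]) for e in state["log_history"] if "loss" in e]
eval_log = [(e["step"], e["eval_loss"], e["eval_accuracy"])
            for e in state["log_history"] if "eval_loss" in e]

print("last train loss:", train_loss[-1])
print("eval checkpoints:", eval_log)
```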
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-525/training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9e6f3241dabbb8a63b52024fb0fb0d68c85f4b48b07e8579ebd8f41fe5fd662
3
+ size 5304
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-700/README.md ADDED
@@ -0,0 +1,202 @@
1
+ ---
2
+ library_name: peft
3
+ base_model: THUDM/glm-4-9b-chat-1m
4
+ ---
5
+
6
+ # Model Card for Model ID
7
+
8
+ <!-- Provide a quick summary of what the model is/does. -->
9
+
10
+
11
+
12
+ ## Model Details
13
+
14
+ ### Model Description
15
+
16
+ <!-- Provide a longer summary of what this model is. -->
17
+
18
+
19
+
20
+ - **Developed by:** [More Information Needed]
21
+ - **Funded by [optional]:** [More Information Needed]
22
+ - **Shared by [optional]:** [More Information Needed]
23
+ - **Model type:** [More Information Needed]
24
+ - **Language(s) (NLP):** [More Information Needed]
25
+ - **License:** [More Information Needed]
26
+ - **Finetuned from model [optional]:** [More Information Needed]
27
+
28
+ ### Model Sources [optional]
29
+
30
+ <!-- Provide the basic links for the model. -->
31
+
32
+ - **Repository:** [More Information Needed]
33
+ - **Paper [optional]:** [More Information Needed]
34
+ - **Demo [optional]:** [More Information Needed]
35
+
36
+ ## Uses
37
+
38
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
39
+
40
+ ### Direct Use
41
+
42
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
43
+
44
+ [More Information Needed]
45
+
46
+ ### Downstream Use [optional]
47
+
48
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
49
+
50
+ [More Information Needed]
51
+
52
+ ### Out-of-Scope Use
53
+
54
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
55
+
56
+ [More Information Needed]
57
+
58
+ ## Bias, Risks, and Limitations
59
+
60
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
61
+
62
+ [More Information Needed]
63
+
64
+ ### Recommendations
65
+
66
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
67
+
68
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
69
+
70
+ ## How to Get Started with the Model
71
+
72
+ Use the code below to get started with the model.
73
+
74
+ [More Information Needed]
75
+
76
+ ## Training Details
77
+
78
+ ### Training Data
79
+
80
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
81
+
82
+ [More Information Needed]
83
+
84
+ ### Training Procedure
85
+
86
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
87
+
88
+ #### Preprocessing [optional]
89
+
90
+ [More Information Needed]
91
+
92
+
93
+ #### Training Hyperparameters
94
+
95
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
96
+
97
+ #### Speeds, Sizes, Times [optional]
98
+
99
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
100
+
101
+ [More Information Needed]
102
+
103
+ ## Evaluation
104
+
105
+ <!-- This section describes the evaluation protocols and provides the results. -->
106
+
107
+ ### Testing Data, Factors & Metrics
108
+
109
+ #### Testing Data
110
+
111
+ <!-- This should link to a Dataset Card if possible. -->
112
+
113
+ [More Information Needed]
114
+
115
+ #### Factors
116
+
117
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
118
+
119
+ [More Information Needed]
120
+
121
+ #### Metrics
122
+
123
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
124
+
125
+ [More Information Needed]
126
+
127
+ ### Results
128
+
129
+ [More Information Needed]
130
+
131
+ #### Summary
132
+
133
+
134
+
135
+ ## Model Examination [optional]
136
+
137
+ <!-- Relevant interpretability work for the model goes here -->
138
+
139
+ [More Information Needed]
140
+
141
+ ## Environmental Impact
142
+
143
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
144
+
145
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
146
+
147
+ - **Hardware Type:** [More Information Needed]
148
+ - **Hours used:** [More Information Needed]
149
+ - **Cloud Provider:** [More Information Needed]
150
+ - **Compute Region:** [More Information Needed]
151
+ - **Carbon Emitted:** [More Information Needed]
152
+
153
+ ## Technical Specifications [optional]
154
+
155
+ ### Model Architecture and Objective
156
+
157
+ [More Information Needed]
158
+
159
+ ### Compute Infrastructure
160
+
161
+ [More Information Needed]
162
+
163
+ #### Hardware
164
+
165
+ [More Information Needed]
166
+
167
+ #### Software
168
+
169
+ [More Information Needed]
170
+
171
+ ## Citation [optional]
172
+
173
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
174
+
175
+ **BibTeX:**
176
+
177
+ [More Information Needed]
178
+
179
+ **APA:**
180
+
181
+ [More Information Needed]
182
+
183
+ ## Glossary [optional]
184
+
185
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
186
+
187
+ [More Information Needed]
188
+
189
+ ## More Information [optional]
190
+
191
+ [More Information Needed]
192
+
193
+ ## Model Card Authors [optional]
194
+
195
+ [More Information Needed]
196
+
197
+ ## Model Card Contact
198
+
199
+ [More Information Needed]
200
+ ### Framework versions
201
+
202
+ - PEFT 0.11.1
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-700/adapter_config.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "THUDM/glm-4-9b-chat-1m",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 16,
14
+ "lora_dropout": 0.0,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 8,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "dense_h_to_4h",
24
+ "query_key_value",
25
+ "dense_4h_to_h",
26
+ "dense"
27
+ ],
28
+ "task_type": "CAUSAL_LM",
29
+ "use_dora": false,
30
+ "use_rslora": false
31
+ }
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-700/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:779d919e4e576eb536f72ff440fea92eb01a8b6522a276d586a48fc2f24d1fd2
3
+ size 85409560
llama-factory/saves/glm-4-9b/lora/sft_bf16_p1_full/checkpoint-700/added_tokens.json ADDED
@@ -0,0 +1,16 @@
1
+ {
2
+ "<eop>": 151334,
3
+ "<sop>": 151333,
4
+ "<|assistant|>": 151337,
5
+ "<|begin_of_image|>": 151339,
6
+ "<|begin_of_video|>": 151341,
7
+ "<|end_of_image|>": 151340,
8
+ "<|end_of_video|>": 151342,
9
+ "<|endoftext|>": 151329,
10
+ "<|observation|>": 151338,
11
+ "<|system|>": 151335,
12
+ "<|user|>": 151336,
13
+ "[MASK]": 151330,
14
+ "[gMASK]": 151331,
15
+ "[sMASK]": 151332
16
+ }