czczup committed on
Commit
1e37f9f
·
verified ·
1 Parent(s): acf5724

Update model code & README.md

Browse files
config.json CHANGED
@@ -1,6 +1,5 @@
1
  {
2
  "_commit_hash": null,
3
- "_name_or_path": "ckpt/OpenGVLab/InternVL2-8B",
4
  "architectures": [
5
  "InternVLChatModel"
6
  ],
@@ -12,9 +11,8 @@
12
  "downsample_ratio": 0.5,
13
  "dynamic_image_size": true,
14
  "force_image_size": 448,
15
- "hidden_size": 4096,
16
  "llm_config": {
17
- "_name_or_path": "./pretrained/internlm2_5-7b-chat",
18
  "add_cross_attention": false,
19
  "architectures": [
20
  "InternLM2ForCausalLM"
@@ -94,109 +92,52 @@
94
  "tie_word_embeddings": false,
95
  "tokenizer_class": null,
96
  "top_k": 50,
97
- "top_p": null,
98
  "torch_dtype": "bfloat16",
99
  "torchscript": false,
100
  "transformers_version": "4.37.2",
101
  "typical_p": 1.0,
102
  "use_bfloat16": true,
103
- "use_cache": false,
104
  "vocab_size": 92553
105
  },
106
  "max_dynamic_patch": 12,
107
  "min_dynamic_patch": 1,
108
  "model_type": "internvl_chat",
109
- "pad2square": false,
110
  "ps_version": "v2",
111
  "select_layer": -1,
112
  "template": "internlm2-chat",
113
- "tie_word_embeddings": false,
114
  "torch_dtype": "bfloat16",
115
- "transformers_version": null,
116
  "use_backbone_lora": 0,
117
  "use_llm_lora": 0,
118
  "use_thumbnail": true,
119
  "vision_config": {
120
- "_name_or_path": "",
121
- "add_cross_attention": false,
122
  "architectures": [
123
  "InternVisionModel"
124
  ],
125
  "attention_dropout": 0.0,
126
- "bad_words_ids": null,
127
- "begin_suppress_tokens": null,
128
- "bos_token_id": null,
129
- "chunk_size_feed_forward": 0,
130
- "cross_attention_hidden_size": null,
131
- "decoder_start_token_id": null,
132
- "diversity_penalty": 0.0,
133
- "do_sample": false,
134
- "drop_path_rate": 0.1,
135
  "dropout": 0.0,
136
- "early_stopping": false,
137
- "encoder_no_repeat_ngram_size": 0,
138
- "eos_token_id": null,
139
- "exponential_decay_length_penalty": null,
140
- "finetuning_task": null,
141
- "forced_bos_token_id": null,
142
- "forced_eos_token_id": null,
143
  "hidden_act": "gelu",
144
  "hidden_size": 1024,
145
- "id2label": {
146
- "0": "LABEL_0",
147
- "1": "LABEL_1"
148
- },
149
  "image_size": 448,
150
  "initializer_factor": 1.0,
151
  "initializer_range": 0.02,
152
  "intermediate_size": 4096,
153
- "is_decoder": false,
154
- "is_encoder_decoder": false,
155
- "label2id": {
156
- "LABEL_0": 0,
157
- "LABEL_1": 1
158
- },
159
  "layer_norm_eps": 1e-06,
160
- "length_penalty": 1.0,
161
- "max_length": 20,
162
- "min_length": 0,
163
  "model_type": "intern_vit_6b",
164
- "no_repeat_ngram_size": 0,
165
  "norm_type": "layer_norm",
166
  "num_attention_heads": 16,
167
- "num_beam_groups": 1,
168
- "num_beams": 1,
169
  "num_channels": 3,
170
  "num_hidden_layers": 24,
171
- "num_return_sequences": 1,
172
  "output_attentions": false,
173
  "output_hidden_states": false,
174
- "output_scores": false,
175
- "pad_token_id": null,
176
  "patch_size": 14,
177
- "prefix": null,
178
- "problem_type": null,
179
- "pruned_heads": {},
180
  "qk_normalization": false,
181
  "qkv_bias": true,
182
- "remove_invalid_values": false,
183
- "repetition_penalty": 1.0,
184
  "return_dict": true,
185
- "return_dict_in_generate": false,
186
- "sep_token_id": null,
187
- "suppress_tokens": null,
188
- "task_specific_params": null,
189
- "temperature": 1.0,
190
- "tf_legacy_loss": false,
191
- "tie_encoder_decoder": false,
192
- "tie_word_embeddings": true,
193
- "tokenizer_class": null,
194
- "top_k": 50,
195
- "top_p": null,
196
  "torch_dtype": "bfloat16",
197
- "torchscript": false,
198
  "transformers_version": "4.37.2",
199
- "typical_p": 1.0,
200
  "use_bfloat16": true,
201
  "use_flash_attn": true
202
  }
 
1
  {
2
  "_commit_hash": null,
 
3
  "architectures": [
4
  "InternVLChatModel"
5
  ],
 
11
  "downsample_ratio": 0.5,
12
  "dynamic_image_size": true,
13
  "force_image_size": 448,
 
14
  "llm_config": {
15
+ "_name_or_path": "internlm/internlm2_5-7b-chat",
16
  "add_cross_attention": false,
17
  "architectures": [
18
  "InternLM2ForCausalLM"
 
92
  "tie_word_embeddings": false,
93
  "tokenizer_class": null,
94
  "top_k": 50,
95
+ "top_p": 1.0,
96
  "torch_dtype": "bfloat16",
97
  "torchscript": false,
98
  "transformers_version": "4.37.2",
99
  "typical_p": 1.0,
100
  "use_bfloat16": true,
101
+ "use_cache": true,
102
  "vocab_size": 92553
103
  },
104
  "max_dynamic_patch": 12,
105
  "min_dynamic_patch": 1,
106
  "model_type": "internvl_chat",
 
107
  "ps_version": "v2",
108
  "select_layer": -1,
109
  "template": "internlm2-chat",
 
110
  "torch_dtype": "bfloat16",
 
111
  "use_backbone_lora": 0,
112
  "use_llm_lora": 0,
113
  "use_thumbnail": true,
114
  "vision_config": {
 
 
115
  "architectures": [
116
  "InternVisionModel"
117
  ],
118
  "attention_dropout": 0.0,
119
+ "drop_path_rate": 0.0,
 
 
 
 
 
 
 
 
120
  "dropout": 0.0,
 
 
 
 
 
 
 
121
  "hidden_act": "gelu",
122
  "hidden_size": 1024,
 
 
 
 
123
  "image_size": 448,
124
  "initializer_factor": 1.0,
125
  "initializer_range": 0.02,
126
  "intermediate_size": 4096,
 
 
 
 
 
 
127
  "layer_norm_eps": 1e-06,
 
 
 
128
  "model_type": "intern_vit_6b",
 
129
  "norm_type": "layer_norm",
130
  "num_attention_heads": 16,
 
 
131
  "num_channels": 3,
132
  "num_hidden_layers": 24,
 
133
  "output_attentions": false,
134
  "output_hidden_states": false,
 
 
135
  "patch_size": 14,
 
 
 
136
  "qk_normalization": false,
137
  "qkv_bias": true,
 
 
138
  "return_dict": true,
 
 
 
 
 
 
 
 
 
 
 
139
  "torch_dtype": "bfloat16",
 
140
  "transformers_version": "4.37.2",
 
141
  "use_bfloat16": true,
142
  "use_flash_attn": true
143
  }
configuration_intern_vit.py CHANGED
@@ -3,6 +3,7 @@
3
  # Copyright (c) 2024 OpenGVLab
4
  # Licensed under The MIT License [see LICENSE for details]
5
  # --------------------------------------------------------
 
6
  import os
7
  from typing import Union
8
 
 
3
  # Copyright (c) 2024 OpenGVLab
4
  # Licensed under The MIT License [see LICENSE for details]
5
  # --------------------------------------------------------
6
+
7
  import os
8
  from typing import Union
9
 
configuration_internvl_chat.py CHANGED
@@ -39,20 +39,20 @@ class InternVLChatConfig(PretrainedConfig):
39
  super().__init__(**kwargs)
40
 
41
  if vision_config is None:
42
- vision_config = {}
43
  logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
44
 
45
  if llm_config is None:
46
- llm_config = {}
47
  logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
48
 
49
  self.vision_config = InternVisionConfig(**vision_config)
50
- if llm_config['architectures'][0] == 'LlamaForCausalLM':
51
  self.llm_config = LlamaConfig(**llm_config)
52
- elif llm_config['architectures'][0] == 'InternLM2ForCausalLM':
53
  self.llm_config = InternLM2Config(**llm_config)
54
  else:
55
- raise ValueError('Unsupported architecture: {}'.format(llm_config['architectures'][0]))
56
  self.use_backbone_lora = use_backbone_lora
57
  self.use_llm_lora = use_llm_lora
58
  self.select_layer = select_layer
 
39
  super().__init__(**kwargs)
40
 
41
  if vision_config is None:
42
+ vision_config = {'architectures': ['InternVisionModel']}
43
  logger.info('vision_config is None. Initializing the InternVisionConfig with default values.')
44
 
45
  if llm_config is None:
46
+ llm_config = {'architectures': ['InternLM2ForCausalLM']}
47
  logger.info('llm_config is None. Initializing the LlamaConfig config with default values (`LlamaConfig`).')
48
 
49
  self.vision_config = InternVisionConfig(**vision_config)
50
+ if llm_config.get('architectures')[0] == 'LlamaForCausalLM':
51
  self.llm_config = LlamaConfig(**llm_config)
52
+ elif llm_config.get('architectures')[0] == 'InternLM2ForCausalLM':
53
  self.llm_config = InternLM2Config(**llm_config)
54
  else:
55
+ raise ValueError('Unsupported architecture: {}'.format(llm_config.get('architectures')[0]))
56
  self.use_backbone_lora = use_backbone_lora
57
  self.use_llm_lora = use_llm_lora
58
  self.select_layer = select_layer
conversation.py CHANGED
@@ -3,11 +3,13 @@ Conversation prompt templates.
3
 
4
  We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
  If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
 
 
6
  """
7
 
8
  import dataclasses
9
  from enum import IntEnum, auto
10
- from typing import Any, Dict, List, Tuple, Union
11
 
12
 
13
  class SeparatorStyle(IntEnum):
@@ -344,12 +346,6 @@ register_conv_template(
344
  roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
345
  sep_style=SeparatorStyle.MPT,
346
  sep='<|im_end|>',
347
- stop_token_ids=[
348
- 2,
349
- 6,
350
- 7,
351
- 8,
352
- ],
353
  stop_str='<|endoftext|>',
354
  )
355
  )
@@ -365,11 +361,6 @@ register_conv_template(
365
  roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
366
  sep_style=SeparatorStyle.MPT,
367
  sep='<|im_end|>',
368
- stop_token_ids=[
369
- 2,
370
- 92543,
371
- 92542
372
- ]
373
  )
374
  )
375
 
@@ -384,10 +375,17 @@ register_conv_template(
384
  roles=('<|user|>\n', '<|assistant|>\n'),
385
  sep_style=SeparatorStyle.MPT,
386
  sep='<|end|>',
387
- stop_token_ids=[
388
- 2,
389
- 32000,
390
- 32007
391
- ]
 
 
 
 
 
 
 
392
  )
393
  )
 
3
 
4
  We kindly request that you import fastchat instead of copying this file if you wish to use it.
5
  If you have changes in mind, please contribute back so the community can benefit collectively and continue to maintain these valuable templates.
6
+
7
+ Modified from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
8
  """
9
 
10
  import dataclasses
11
  from enum import IntEnum, auto
12
+ from typing import Dict, List, Tuple, Union
13
 
14
 
15
  class SeparatorStyle(IntEnum):
 
346
  roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
347
  sep_style=SeparatorStyle.MPT,
348
  sep='<|im_end|>',
 
 
 
 
 
 
349
  stop_str='<|endoftext|>',
350
  )
351
  )
 
361
  roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
362
  sep_style=SeparatorStyle.MPT,
363
  sep='<|im_end|>',
 
 
 
 
 
364
  )
365
  )
366
 
 
375
  roles=('<|user|>\n', '<|assistant|>\n'),
376
  sep_style=SeparatorStyle.MPT,
377
  sep='<|end|>',
378
+ )
379
+ )
380
+
381
+
382
+ register_conv_template(
383
+ Conversation(
384
+ name='internvl2_5',
385
+ system_template='<|im_start|>system\n{system_message}',
386
+ system_message='你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。',
387
+ roles=('<|im_start|>user\n', '<|im_start|>assistant\n'),
388
+ sep_style=SeparatorStyle.MPT,
389
+ sep='<|im_end|>\n',
390
  )
391
  )
generation_config.json CHANGED
@@ -1,4 +1,8 @@
1
  {
2
  "_from_model_config": true,
3
- "transformers_version": "4.37.2"
 
 
 
 
4
  }
 
1
  {
2
  "_from_model_config": true,
3
+ "transformers_version": "4.37.2",
4
+ "eos_token_id": [
5
+ 92542,
6
+ 92543
7
+ ]
8
  }
modeling_intern_vit.py CHANGED
@@ -3,6 +3,7 @@
3
  # Copyright (c) 2024 OpenGVLab
4
  # Licensed under The MIT License [see LICENSE for details]
5
  # --------------------------------------------------------
 
6
  from typing import Optional, Tuple, Union
7
 
8
  import torch
 
3
  # Copyright (c) 2024 OpenGVLab
4
  # Licensed under The MIT License [see LICENSE for details]
5
  # --------------------------------------------------------
6
+
7
  from typing import Optional, Tuple, Union
8
 
9
  import torch
modeling_internvl_chat.py CHANGED
@@ -3,8 +3,9 @@
3
  # Copyright (c) 2024 OpenGVLab
4
  # Licensed under The MIT License [see LICENSE for details]
5
  # --------------------------------------------------------
 
6
  import warnings
7
- from typing import Any, List, Optional, Tuple, Union
8
 
9
  import torch.utils.checkpoint
10
  import transformers
@@ -35,13 +36,14 @@ def version_cmp(v1, v2, op='eq'):
35
  class InternVLChatModel(PreTrainedModel):
36
  config_class = InternVLChatConfig
37
  main_input_name = 'pixel_values'
 
38
  _supports_flash_attn_2 = True
39
  _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer']
40
 
41
  def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
42
  super().__init__(config)
43
 
44
- assert version_cmp(transformers.__version__, '4.36.2', 'ge')
45
  image_size = config.force_image_size or config.vision_config.image_size
46
  patch_size = config.vision_config.patch_size
47
  self.patch_size = patch_size
@@ -101,7 +103,7 @@ class InternVLChatModel(PreTrainedModel):
101
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
102
 
103
  image_flags = image_flags.squeeze(-1)
104
- input_embeds = self.language_model.get_input_embeddings()(input_ids)
105
 
106
  vit_embeds = self.extract_feature(pixel_values)
107
  vit_embeds = vit_embeds[image_flags == 1]
@@ -110,7 +112,7 @@ class InternVLChatModel(PreTrainedModel):
110
  B, N, C = input_embeds.shape
111
  input_embeds = input_embeds.reshape(B * N, C)
112
 
113
- if torch.distributed.get_rank() == 0:
114
  print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
115
 
116
  input_ids = input_ids.reshape(B * N)
@@ -234,9 +236,9 @@ class InternVLChatModel(PreTrainedModel):
234
 
235
  tokenizer.padding_side = 'left'
236
  model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
237
- input_ids = model_inputs['input_ids'].cuda()
238
- attention_mask = model_inputs['attention_mask'].cuda()
239
- eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
240
  generation_config['eos_token_id'] = eos_token_id
241
  generation_output = self.generate(
242
  pixel_values=pixel_values,
@@ -245,7 +247,7 @@ class InternVLChatModel(PreTrainedModel):
245
  **generation_config
246
  )
247
  responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
248
- responses = [response.split(template.sep)[0].strip() for response in responses]
249
  return responses
250
 
251
  def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
@@ -264,7 +266,7 @@ class InternVLChatModel(PreTrainedModel):
264
 
265
  template = get_conv_template(self.template)
266
  template.system_message = self.system_message
267
- eos_token_id = tokenizer.convert_tokens_to_ids(template.sep)
268
 
269
  history = [] if history is None else history
270
  for (old_question, old_answer) in history:
@@ -283,8 +285,8 @@ class InternVLChatModel(PreTrainedModel):
283
  query = query.replace('<image>', image_tokens, 1)
284
 
285
  model_inputs = tokenizer(query, return_tensors='pt')
286
- input_ids = model_inputs['input_ids'].cuda()
287
- attention_mask = model_inputs['attention_mask'].cuda()
288
  generation_config['eos_token_id'] = eos_token_id
289
  generation_output = self.generate(
290
  pixel_values=pixel_values,
@@ -293,7 +295,7 @@ class InternVLChatModel(PreTrainedModel):
293
  **generation_config
294
  )
295
  response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
296
- response = response.split(template.sep)[0].strip()
297
  history.append((question, response))
298
  if return_history:
299
  return response, history
@@ -313,7 +315,6 @@ class InternVLChatModel(PreTrainedModel):
313
  visual_features: Optional[torch.FloatTensor] = None,
314
  generation_config: Optional[GenerationConfig] = None,
315
  output_hidden_states: Optional[bool] = None,
316
- return_dict: Optional[bool] = None,
317
  **generate_kwargs,
318
  ) -> torch.LongTensor:
319
 
@@ -341,7 +342,6 @@ class InternVLChatModel(PreTrainedModel):
341
  attention_mask=attention_mask,
342
  generation_config=generation_config,
343
  output_hidden_states=output_hidden_states,
344
- return_dict=return_dict,
345
  use_cache=True,
346
  **generate_kwargs,
347
  )
 
3
  # Copyright (c) 2024 OpenGVLab
4
  # Licensed under The MIT License [see LICENSE for details]
5
  # --------------------------------------------------------
6
+
7
  import warnings
8
+ from typing import List, Optional, Tuple, Union
9
 
10
  import torch.utils.checkpoint
11
  import transformers
 
36
  class InternVLChatModel(PreTrainedModel):
37
  config_class = InternVLChatConfig
38
  main_input_name = 'pixel_values'
39
+ base_model_prefix = 'language_model'
40
  _supports_flash_attn_2 = True
41
  _no_split_modules = ['InternVisionModel', 'LlamaDecoderLayer', 'InternLM2DecoderLayer']
42
 
43
  def __init__(self, config: InternVLChatConfig, vision_model=None, language_model=None, use_flash_attn=True):
44
  super().__init__(config)
45
 
46
+ assert version_cmp(transformers.__version__, '4.37.0', 'ge')
47
  image_size = config.force_image_size or config.vision_config.image_size
48
  patch_size = config.vision_config.patch_size
49
  self.patch_size = patch_size
 
103
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
104
 
105
  image_flags = image_flags.squeeze(-1)
106
+ input_embeds = self.language_model.get_input_embeddings()(input_ids).clone()
107
 
108
  vit_embeds = self.extract_feature(pixel_values)
109
  vit_embeds = vit_embeds[image_flags == 1]
 
112
  B, N, C = input_embeds.shape
113
  input_embeds = input_embeds.reshape(B * N, C)
114
 
115
+ if torch.distributed.is_initialized() and torch.distributed.get_rank() == 0:
116
  print(f'dynamic ViT batch size: {vit_batch_size}, images per sample: {vit_batch_size / B}, dynamic token length: {N}')
117
 
118
  input_ids = input_ids.reshape(B * N)
 
236
 
237
  tokenizer.padding_side = 'left'
238
  model_inputs = tokenizer(queries, return_tensors='pt', padding=True)
239
+ input_ids = model_inputs['input_ids'].to(self.device)
240
+ attention_mask = model_inputs['attention_mask'].to(self.device)
241
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
242
  generation_config['eos_token_id'] = eos_token_id
243
  generation_output = self.generate(
244
  pixel_values=pixel_values,
 
247
  **generation_config
248
  )
249
  responses = tokenizer.batch_decode(generation_output, skip_special_tokens=True)
250
+ responses = [response.split(template.sep.strip())[0].strip() for response in responses]
251
  return responses
252
 
253
  def chat(self, tokenizer, pixel_values, question, generation_config, history=None, return_history=False,
 
266
 
267
  template = get_conv_template(self.template)
268
  template.system_message = self.system_message
269
+ eos_token_id = tokenizer.convert_tokens_to_ids(template.sep.strip())
270
 
271
  history = [] if history is None else history
272
  for (old_question, old_answer) in history:
 
285
  query = query.replace('<image>', image_tokens, 1)
286
 
287
  model_inputs = tokenizer(query, return_tensors='pt')
288
+ input_ids = model_inputs['input_ids'].to(self.device)
289
+ attention_mask = model_inputs['attention_mask'].to(self.device)
290
  generation_config['eos_token_id'] = eos_token_id
291
  generation_output = self.generate(
292
  pixel_values=pixel_values,
 
295
  **generation_config
296
  )
297
  response = tokenizer.batch_decode(generation_output, skip_special_tokens=True)[0]
298
+ response = response.split(template.sep.strip())[0].strip()
299
  history.append((question, response))
300
  if return_history:
301
  return response, history
 
315
  visual_features: Optional[torch.FloatTensor] = None,
316
  generation_config: Optional[GenerationConfig] = None,
317
  output_hidden_states: Optional[bool] = None,
 
318
  **generate_kwargs,
319
  ) -> torch.LongTensor:
320
 
 
342
  attention_mask=attention_mask,
343
  generation_config=generation_config,
344
  output_hidden_states=output_hidden_states,
 
345
  use_cache=True,
346
  **generate_kwargs,
347
  )
preprocessor_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": 448,
3
+ "do_center_crop": true,
4
+ "do_normalize": true,
5
+ "do_resize": true,
6
+ "feature_extractor_type": "CLIPFeatureExtractor",
7
+ "image_mean": [
8
+ 0.485,
9
+ 0.456,
10
+ 0.406
11
+ ],
12
+ "image_std": [
13
+ 0.229,
14
+ 0.224,
15
+ 0.225
16
+ ],
17
+ "resample": 3,
18
+ "size": 448
19
+ }
tokenization_internlm2_fast.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) The InternLM team and The HuggingFace Inc. team. All rights reserved.
2
+ #
3
+ # This code is based on transformers/src/transformers/models/llama/tokenization_llama_fast.py
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+
17
+ """Tokenization Fast class for InternLM."""
18
+ import os
19
+ from shutil import copyfile
20
+ from typing import Any, Dict, Optional, Tuple
21
+
22
+ from tokenizers import Tokenizer, decoders, normalizers, processors
23
+ from tokenizers.models import BPE
24
+ from transformers.convert_slow_tokenizer import (SLOW_TO_FAST_CONVERTERS,
25
+ SentencePieceExtractor,
26
+ SpmConverter)
27
+ from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
28
+ from transformers.utils import logging
29
+
30
+ from .tokenization_internlm2 import InternLM2Tokenizer
31
+
32
+ logger = logging.get_logger(__name__)
33
+
34
+ VOCAB_FILES_NAMES = {'vocab_file': './tokenizer.model'}
35
+
36
+
37
+ # Modified from transformers.convert_slow_tokenizer.LlamaConverter
38
class InternLM2Converter(SpmConverter):
    """Build a `tokenizers` backend from the slow SentencePiece InternLM2 tokenizer.

    Only BPE SentencePiece models (``trainer_spec.model_type == 2``) are
    supported; any other model type is rejected in :meth:`tokenizer`.
    """

    handle_byte_fallback = True

    def vocab(self, proto):
        """Return ``(piece, score)`` pairs for the whole vocabulary.

        The first three entries are forced to ``<unk>``, ``<s>`` and ``</s>``
        with a score of 0.0; the remaining entries are taken from
        ``proto.pieces[3:]`` unchanged.
        """
        vocab = [
            ('<unk>', 0.0),
            ('<s>', 0.0),
            ('</s>', 0.0),
        ]
        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
        return vocab

    def unk_id(self, proto):
        """Return the id of the unknown token, which is always 0 (see :meth:`vocab`)."""
        return 0

    def decoder(self, replacement, add_prefix_space):
        """Return the decoder pipeline; ``replacement`` and ``add_prefix_space`` are ignored."""
        return decoders.Sequence(
            [
                decoders.Replace('▁', ' '),
                decoders.ByteFallback(),
                decoders.Fuse(),
                decoders.Strip(content=' ', left=1),
            ]
        )

    def tokenizer(self, proto):
        """Create the BPE `Tokenizer` for the SentencePiece model described by ``proto``.

        Raises:
            RuntimeError: if the model is a Unigram model (``model_type == 1``).
            Exception: if the model type is neither Unigram nor BPE.
        """
        model_type = proto.trainer_spec.model_type
        vocab_scores = self.vocab(proto)

        # Special tokens: overwrite the piece text at every index that the slow
        # tokenizer lists in `added_tokens_decoder`, keeping the original score.
        added_tokens = self.original_tokenizer.added_tokens_decoder
        for i, (_piece, score) in enumerate(vocab_scores):
            if i in added_tokens:
                vocab_scores[i] = (added_tokens[i].content, score)

        if model_type == 1:
            raise RuntimeError('InternLM2 is supposed to be a BPE model!')
        if model_type != 2:
            # The exception type is kept as plain `Exception` for callers that
            # already catch it; only the (previously misleading) message changed.
            raise Exception(
                f'Unsupported SentencePiece model type {model_type}: '
                'InternLM2 only supports BPE models (model_type == 2)'
            )

        _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
        bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
        tokenizer = Tokenizer(
            BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
        )
        tokenizer.add_special_tokens(list(added_tokens.values()))
        return tokenizer

    def normalizer(self, proto):
        """Return the normalizer: optional ``▁`` prefix, then every space replaced by ``▁``."""
        normalizers_list = []
        if proto.normalizer_spec.add_dummy_prefix:
            normalizers_list.append(normalizers.Prepend(prepend='▁'))
        normalizers_list.append(normalizers.Replace(pattern=' ', content='▁'))
        return normalizers.Sequence(normalizers_list)

    def pre_tokenizer(self, replacement, add_prefix_space):
        """Return ``None``: this converter configures no pre-tokenizer."""
        return None
101
+
102
+
103
+ SLOW_TO_FAST_CONVERTERS['InternLM2Tokenizer'] = InternLM2Converter
104
+
105
+
106
+ # Modified from transformers.model.llama.tokenization_llama_fast.LlamaTokenizerFast -> InternLM2TokenizerFast
107
class InternLM2TokenizerFast(PreTrainedTokenizerFast):
    """Fast tokenizer for InternLM2 backed by the `tokenizers` library.

    The BOS/EOS insertion behaviour is controlled by the ``add_bos_token`` and
    ``add_eos_token`` properties; assigning to either one rebuilds the
    underlying post processor.
    """

    vocab_files_names = VOCAB_FILES_NAMES
    slow_tokenizer_class = InternLM2Tokenizer
    padding_side = 'left'
    model_input_names = ['input_ids', 'attention_mask']
    _auto_class = 'AutoTokenizer'

    def __init__(
        self,
        vocab_file,
        unk_token='<unk>',
        bos_token='<s>',
        eos_token='</s>',
        pad_token='</s>',
        sp_model_kwargs: Optional[Dict[str, Any]] = None,
        add_bos_token=True,
        add_eos_token=False,
        decode_with_prefix_space=False,
        clean_up_tokenization_spaces=False,
        **kwargs,
    ):
        """Forward all arguments to the base class, then install the post processor."""
        super().__init__(
            vocab_file=vocab_file,
            unk_token=unk_token,
            bos_token=bos_token,
            eos_token=eos_token,
            pad_token=pad_token,
            sp_model_kwargs=sp_model_kwargs,
            add_bos_token=add_bos_token,
            add_eos_token=add_eos_token,
            decode_with_prefix_space=decode_with_prefix_space,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs,
        )
        # Set the private flags directly: going through the property setters
        # would rebuild the post processor twice.
        self._add_bos_token = add_bos_token
        self._add_eos_token = add_eos_token
        self.update_post_processor()
        self.vocab_file = vocab_file

    @property
    def can_save_slow_tokenizer(self) -> bool:
        """Whether ``self.vocab_file`` is set and points at an existing file."""
        return os.path.isfile(self.vocab_file) if self.vocab_file else False

    def update_post_processor(self):
        """
        Updates the underlying post processor with the current `bos_token` and `eos_token`.

        Raises:
            ValueError: if a token is requested via ``add_bos_token`` /
                ``add_eos_token`` but the corresponding token is ``None``.
        """
        bos = self.bos_token
        bos_token_id = self.bos_token_id
        if bos is None and self.add_bos_token:
            raise ValueError('add_bos_token = True but bos_token = None')

        eos = self.eos_token
        eos_token_id = self.eos_token_id
        if eos is None and self.add_eos_token:
            raise ValueError('add_eos_token = True but eos_token = None')

        # Template strings for `TemplateProcessing`: `$A` / `$B` are the first
        # and second sequence, the `:0` / `:1` suffix is the type id.
        single = f"{(bos+':0 ') if self.add_bos_token else ''}$A:0{(' '+eos+':0') if self.add_eos_token else ''}"
        pair = f"{single}{(' '+bos+':1') if self.add_bos_token else ''} $B:1{(' '+eos+':1') if self.add_eos_token else ''}"

        special_tokens = []
        if self.add_bos_token:
            special_tokens.append((bos, bos_token_id))
        if self.add_eos_token:
            special_tokens.append((eos, eos_token_id))
        self._tokenizer.post_processor = processors.TemplateProcessing(
            single=single, pair=pair, special_tokens=special_tokens
        )

    @property
    def add_eos_token(self):
        """Whether the EOS token is appended by the post processor."""
        return self._add_eos_token

    @property
    def add_bos_token(self):
        """Whether the BOS token is prepended by the post processor."""
        return self._add_bos_token

    @add_eos_token.setter
    def add_eos_token(self, value):
        self._add_eos_token = value
        self.update_post_processor()

    @add_bos_token.setter
    def add_bos_token(self, value):
        self._add_bos_token = value
        self.update_post_processor()

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Optional[Tuple[str]]:
        """Copy the slow tokenizer's vocabulary file into ``save_directory``.

        Args:
            save_directory: Existing directory to copy the vocabulary file into.
            filename_prefix: Optional prefix; when given, the file is saved as
                ``<prefix>-<vocab file name>``.

        Returns:
            A 1-tuple with the path of the saved file, or ``None`` (after
            logging an error) when ``save_directory`` is not a directory.

        Raises:
            ValueError: if the original vocabulary file is not available.
        """
        if not self.can_save_slow_tokenizer:
            raise ValueError(
                'Your fast tokenizer does not have the necessary information to save the vocabulary for a slow '
                'tokenizer.'
            )

        if not os.path.isdir(save_directory):
            logger.error(f'Vocabulary path ({save_directory}) should be a directory')
            return

        # The configured name is './tokenizer.model'. Take the basename before
        # prepending the prefix, otherwise the result is '<prefix>-./tokenizer.model',
        # i.e. a file inside a non-existent '<prefix>-.' directory.
        vocab_file_name = os.path.basename(VOCAB_FILES_NAMES['vocab_file'])
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + '-' if filename_prefix else '') + vocab_file_name
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)