BoyaWu10 committed
Commit e38065d
1 Parent(s): 0a4032c

Update quickstart

Files changed (4)
  1. README.md +55 -0
  2. config.json +2 -2
  3. configuration_bunny_phi.py +253 -0
  4. modeling_bunny_phi.py +0 -0
README.md CHANGED
@@ -20,6 +20,61 @@ More details about this model can be found in [GitHub](https://github.com/BAAI-D
  
  ![comparison](comparison.png)
  
+ # Quickstart
+ 
+ Here is a code snippet showing how to use the model with transformers:
+ 
+ ```python
+ import torch
+ import transformers
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from PIL import Image
+ import warnings
+ 
+ # disable some warnings
+ transformers.logging.set_verbosity_error()
+ transformers.logging.disable_progress_bar()
+ warnings.filterwarnings('ignore')
+ 
+ # set device
+ torch.set_default_device('cpu')  # or 'cuda'
+ 
+ # create model
+ model = AutoModelForCausalLM.from_pretrained(
+     'BAAI/bunny-phi-2-siglip',
+     torch_dtype=torch.float16,
+     device_map='auto',
+     trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained(
+     'BAAI/bunny-phi-2-siglip',
+     trust_remote_code=True)
+ 
+ # text prompt
+ prompt = 'Why is the image funny?'
+ text = f"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <image>\n{prompt} ASSISTANT:"
+ text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
+ input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
+ 
+ # image
+ image = Image.open('example_2.png')
+ image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
+ 
+ # generate
+ output_ids = model.generate(
+     input_ids,
+     images=image_tensor,
+     max_new_tokens=100,
+     use_cache=True)[0]
+ 
+ print(tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip())
+ ```
+ 
+ Before running the snippet, you need to install the following dependencies:
+ 
+ ```shell
+ pip install torch transformers accelerate
+ ```
+ 
  # License
  This project utilizes certain datasets and checkpoints that are subject to their respective original licenses. Users must comply with all terms and conditions of these original licenses.
  The content of this project itself is licensed under the Apache license 2.0.
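Note that the snippet sets the default device to `'cpu'` while loading the weights in `float16`; half-precision kernels are often slow or unsupported on CPU, so a CPU-only run may need to load in full precision instead. A minimal variant of the loading call (an assumption, not part of the model card) could look like:

```python
# Hypothetical CPU-only variant of the loading call above (not from the model card):
# keep trust_remote_code, but use float32 and omit device_map so everything stays on CPU.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    'BAAI/bunny-phi-2-siglip',
    torch_dtype=torch.float32,
    trust_remote_code=True)
```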
config.json CHANGED
@@ -5,8 +5,8 @@
    ],
    "attention_dropout": 0.0,
    "auto_map": {
-     "AutoConfig": "configuration_phi.PhiConfig",
-     "AutoModelForCausalLM": "modeling_phi.PhiForCausalLM"
+     "AutoConfig": "configuration_bunny_phi.BunnyPhiConfig",
+     "AutoModelForCausalLM": "modeling_bunny_phi.BunnyPhiForCausalLM"
    },
    "bos_token_id": 50256,
    "embd_pdrop": 0.0,
configuration_bunny_phi.py ADDED
@@ -0,0 +1,253 @@
+ # coding=utf-8
+ # Copyright 2023 Microsoft and the HuggingFace Inc. team. All rights reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ 
+ """ Phi model configuration"""
+ 
+ from transformers.configuration_utils import PretrainedConfig
+ from transformers.utils import logging
+ 
+ logger = logging.get_logger(__name__)
+ 
+ PHI_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+     "microsoft/phi-1": "https://huggingface.co/microsoft/phi-1/resolve/main/config.json",
+     "microsoft/phi-1_5": "https://huggingface.co/microsoft/phi-1_5/resolve/main/config.json",
+     "microsoft/phi-2": "https://huggingface.co/microsoft/phi-2/resolve/main/config.json",
+ }
+ 
+ 
+ class PhiConfig(PretrainedConfig):
+     r"""
+     This is the configuration class to store the configuration of a [`PhiModel`]. It is used to instantiate a Phi
+     model according to the specified arguments, defining the model architecture. Instantiating a configuration with
+     the defaults will yield a similar configuration to that of the Phi
+     [microsoft/phi-1](https://huggingface.co/microsoft/phi-1).
+ 
+     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+     documentation from [`PretrainedConfig`] for more information.
+ 
+     Args:
+         vocab_size (`int`, *optional*, defaults to 51200):
+             Vocabulary size of the Phi model. Defines the number of different tokens that can be represented by the
+             `inputs_ids` passed when calling [`PhiModel`].
+         hidden_size (`int`, *optional*, defaults to 2048):
+             Dimension of the hidden representations.
+         intermediate_size (`int`, *optional*, defaults to 8192):
+             Dimension of the MLP representations.
+         num_hidden_layers (`int`, *optional*, defaults to 24):
+             Number of hidden layers in the Transformer decoder.
+         num_attention_heads (`int`, *optional*, defaults to 32):
+             Number of attention heads for each attention layer in the Transformer decoder.
+         num_key_value_heads (`int`, *optional*):
+             This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+             `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+             `num_key_value_heads=1` the model will use Multi Query Attention (MQA), otherwise GQA is used. When
+             converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+             by meanpooling all the original heads within that group. For more details check out [this
+             paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
+             `num_attention_heads`.
+         resid_pdrop (`float`, *optional*, defaults to 0.0):
+             Dropout probability for mlp outputs.
+         embd_pdrop (`float`, *optional*, defaults to 0.0):
+             The dropout ratio for the embeddings.
+         attention_dropout (`float`, *optional*, defaults to 0.0):
+             The dropout ratio after computing the attention scores.
+         hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
+             The non-linear activation function (function or string) in the decoder.
+         max_position_embeddings (`int`, *optional*, defaults to 2048):
+             The maximum sequence length that this model might ever be used with. Phi-1 and Phi-1.5 support up to 2048
+             tokens.
+         initializer_range (`float`, *optional*, defaults to 0.02):
+             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+         layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+             The epsilon used by the rms normalization layers.
+         use_cache (`bool`, *optional*, defaults to `True`):
+             Whether or not the model should return the last key/values attentions (not used by all models). Only
+             relevant if `config.is_decoder=True`.
+         tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+             Whether to tie weight embeddings.
+         rope_theta (`float`, *optional*, defaults to 10000.0):
+             The base period of the RoPE embeddings.
+         rope_scaling (`Dict`, *optional*):
+             Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+             strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
+             is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+             `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+             these scaling strategies behave:
+             https://www.reddit.com/r/LocalPersimmon/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This
+             is an experimental feature, subject to breaking API changes in future versions.
+         partial_rotary_factor (`float`, *optional*, defaults to 0.5):
+             Percentage of the query and keys which will have rotary embedding.
+         qk_layernorm (`bool`, *optional*, defaults to `False`):
+             Whether or not to normalize the Queries and Keys after projecting the hidden states.
+         bos_token_id (`int`, *optional*, defaults to 1):
+             Denotes beginning of sequences token id.
+         eos_token_id (`int`, *optional*, defaults to 2):
+             Denotes end of sequences token id.
+ 
+     Example:
+ 
+     ```python
+     >>> from transformers import PhiModel, PhiConfig
+ 
+     >>> # Initializing a Phi-1 style configuration
+     >>> configuration = PhiConfig.from_pretrained("microsoft/phi-1")
+ 
+     >>> # Initializing a model from the configuration
+     >>> model = PhiModel(configuration)
+ 
+     >>> # Accessing the model configuration
+     >>> configuration = model.config
+     ```"""
+ 
+     model_type = "phi"
+     keys_to_ignore_at_inference = ["past_key_values"]
+ 
+     def __init__(
+         self,
+         vocab_size=51200,
+         hidden_size=2048,
+         intermediate_size=8192,
+         num_hidden_layers=24,
+         num_attention_heads=32,
+         num_key_value_heads=None,
+         resid_pdrop=0.0,
+         embd_pdrop=0.0,
+         attention_dropout=0.0,
+         hidden_act="gelu_new",
+         max_position_embeddings=2048,
+         initializer_range=0.02,
+         layer_norm_eps=1e-5,
+         use_cache=True,
+         tie_word_embeddings=False,
+         rope_theta=10000.0,
+         rope_scaling=None,
+         partial_rotary_factor=0.5,
+         qk_layernorm=False,
+         bos_token_id=1,
+         eos_token_id=2,
+         **kwargs,
+     ):
+         self.vocab_size = vocab_size
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+ 
+         if num_key_value_heads is None:
+             num_key_value_heads = num_attention_heads
+ 
+         self.num_key_value_heads = num_key_value_heads
+         self.resid_pdrop = resid_pdrop
+         self.embd_pdrop = embd_pdrop
+         self.attention_dropout = attention_dropout
+         self.hidden_act = hidden_act
+         self.max_position_embeddings = max_position_embeddings
+         self.initializer_range = initializer_range
+         self.layer_norm_eps = layer_norm_eps
+         self.use_cache = use_cache
+         self.rope_theta = rope_theta
+         self.rope_scaling = rope_scaling
+         self.partial_rotary_factor = partial_rotary_factor
+         self.qk_layernorm = qk_layernorm
+         self._rope_scaling_validation()
+ 
+         super().__init__(
+             bos_token_id=bos_token_id,
+             eos_token_id=eos_token_id,
+             tie_word_embeddings=tie_word_embeddings,
+             **kwargs,
+         )
+ 
+     # Copied from transformers.models.llama.configuration_llama.LlamaConfig._rope_scaling_validation
+     def _rope_scaling_validation(self):
+         """
+         Validate the `rope_scaling` configuration.
+         """
+         if self.rope_scaling is None:
+             return
+ 
+         if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+             raise ValueError(
+                 "`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
+                 f"got {self.rope_scaling}"
+             )
+         rope_scaling_type = self.rope_scaling.get("type", None)
+         rope_scaling_factor = self.rope_scaling.get("factor", None)
+         if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+             raise ValueError(
+                 f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+             )
+         if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+             raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
+ 
+ 
+ from typing import Union
+ from transformers import PretrainedConfig
+ import os
+ 
+ 
+ class SigLipVisionConfig(PretrainedConfig):
+     model_type = "siglip_vision_model"
+ 
+     def __init__(
+         self,
+         hidden_size=1152,
+         image_mean=(0.5, 0.5, 0.5),
+         intermediate_size=4304,
+         num_hidden_layers=27,
+         num_attention_heads=16,
+         num_channels=3,
+         image_size=384,
+         patch_size=14,
+         hidden_act="gelu_pytorch_tanh",
+         layer_norm_eps=1e-6,
+         attention_dropout=0.0,
+         **kwargs,
+     ):
+         super().__init__(**kwargs)
+ 
+         self.hidden_size = hidden_size
+         self.intermediate_size = intermediate_size
+         self.num_hidden_layers = num_hidden_layers
+         self.num_attention_heads = num_attention_heads
+         self.num_channels = num_channels
+         self.patch_size = patch_size
+         self.image_size = image_size
+         self.attention_dropout = attention_dropout
+         self.layer_norm_eps = layer_norm_eps
+         self.hidden_act = hidden_act
+         self.image_mean = image_mean
+ 
+     @classmethod
+     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
+         cls._set_token_in_kwargs(kwargs)
+ 
+         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+ 
+         # get the vision config dict if we are loading from SigLipConfig
+         if config_dict.get("model_type") == "siglip":
+             config_dict = config_dict["vision_config"]
+ 
+         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+             logger.warning(
+                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+             )
+ 
+         return cls.from_dict(config_dict, **kwargs)
+ 
+ 
+ class BunnyPhiConfig(PhiConfig):
+     model_type = "bunny-phi"
modeling_bunny_phi.py ADDED
The diff for this file is too large to render. See raw diff