benchang1110 committed
Commit 4b2523c · verified · 1 Parent(s): fbeb161

Upload processor

Files changed (1)
  1. processing_taivisionlm.py +408 -107
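
This commit swaps the processor's `prompts`/`labels` interface for a PaliGemma-style `text`/`suffix` interface and derives training labels from `token_type_ids` (see the diff below). A minimal usage sketch, adapted from the `__main__` demo at the bottom of the new file — the import path, the example prompt/suffix strings, and the local "./" directory are illustrative assumptions, not part of the commit:

```python
# Sketch only: assumes the processor/tokenizer files from this repo have been
# saved to the current directory, as in the commit's __main__ demo.
import requests
from PIL import Image

from processing_taivisionlm import TaiVisionProcessor  # class defined in this file

processor = TaiVisionProcessor.from_pretrained("./")

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

inputs = processor(
    text="What is in this picture?",  # prefix / prompt (illustrative)
    suffix="A red car.",              # label text, only needed for fine-tuning (illustrative)
    images=image,
    return_tensors="pt",
    padding="max_length",
    max_length=512,
)
# inputs holds input_ids, attention_mask, token_type_ids, pixel_values and labels
```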
processing_taivisionlm.py CHANGED
@@ -1,27 +1,330 @@
1
- """
2
- Processor class for TaiVisionLM.
3
- """
4
- import transformers
5
  import logging
6
  from typing import List, Optional, Union
7
 
8
  from transformers.feature_extraction_utils import BatchFeature
9
  from transformers.image_utils import ImageInput, is_valid_image
10
  from transformers.processing_utils import ProcessorMixin
11
- from transformers.tokenization_utils import (
12
  AddedToken,
13
  PaddingStrategy,
14
  PreTokenizedInput,
15
  TextInput,
16
- TruncationStrategy,
17
  )
18
  from transformers.utils import TensorType
19
- from .configuration_taivisionlm import TaiVisionLMConfig
20
 
21
  logger = logging.getLogger(__name__)
22
 
23
  IMAGE_TOKEN = "<image>"
24
 
 
25
  # Copied from transformers.models.idefics2.processing_idefics2.is_url
26
  def is_url(val) -> bool:
27
  return isinstance(val, str) and val.startswith("http")
@@ -31,34 +334,38 @@ def is_url(val) -> bool:
31
  def is_image_or_image_url(elem):
32
  return is_url(elem) or is_valid_image(elem)
33
 
34
- # Copied from transformers.models.paligemma.processing_paligemma._is_str_or_image
35
  def _is_str_or_image(elem):
36
  return isinstance(elem, (str)) or is_image_or_image_url(elem)
37
 
38
 
39
- def build_string_from_input(image_seq_len, image_token):
40
  """
41
  Builds a string from the input prompt and image tokens.
42
  For example, for the call:
43
  build_string_from_input(
44
  image_seq_len=3,
45
  image_token="<im>",
46
  )
47
  The output will be:
48
- "<im><im><im>"
49
  Args:
50
  image_seq_len (`int`): The length of the image sequence.
51
  image_token (`str`): The image token.
52
  """
53
- return f"{image_token * image_seq_len}"
54
 
55
 
56
  class TaiVisionProcessor(ProcessorMixin):
57
  r"""
58
- Constructs a TraVision processor which wraps a SigLIP image processor and a GPT2 tokenizer into a single processor.
59
 
60
- [`TaiVisionProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`LlamaTokenizerFast`]. See the
61
- [`~TaiVisionProcessor.__call__`] and [`~TaiVisionProcessor.decode`] for more information.
62
 
63
  Args:
64
  image_processor ([`SiglipImageProcessor`], *optional*):
@@ -98,11 +405,12 @@ class TaiVisionProcessor(ProcessorMixin):
98
  tokenizer.add_eos_token = False
99
 
100
  super().__init__(image_processor, tokenizer, chat_template=chat_template)
101
-
102
  def __call__(
103
  self,
104
- prompts: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
105
  images: ImageInput = None,
 
106
  padding: Union[bool, str, PaddingStrategy] = False,
107
  truncation: Union[bool, str, TruncationStrategy] = None,
108
  max_length=None,
@@ -120,30 +428,45 @@ class TaiVisionProcessor(ProcessorMixin):
120
  do_thumbnail: bool = None,
121
  do_align_long_axis: bool = None,
122
  do_rescale: bool = None,
123
- labels: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
124
  ) -> BatchFeature:
125
  """
126
  Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
127
- and `kwargs` arguments to GPT2TokenizerFast's [`~GPT2TokenizerFast.__call__`] if `text` is not `None` to encode
128
  the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
129
  SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
130
  of the above two methods for more information.
131
 
132
- The usage for TraVisionLM fine-tuning preparation follows a standard 4D causal mask where only the prompt and label tokens
133
- are attended in an auto-regressive manner. The label in `text` are to be passed separately to the __call__ function and
134
- will be placed after the prompt, which is the instruction to steer the model generation.
135
 
136
  Args:
137
- prompts (`str`, `List[str]`, `List[List[str]]`):
138
  The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
139
  (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
140
  `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
141
  images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
142
  The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
143
  tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
144
- number of channels, H and W are \image height and width.
145
- tokenize_newline_separately (`bool`, defaults to `False`):
146
- Adds a separately tokenized '\n' at the end of the prompt.
147
  padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
148
  Select a strategy to pad the returned sequences (according to the model's padding side and padding
149
  index) among:
@@ -164,76 +487,56 @@ class TaiVisionProcessor(ProcessorMixin):
164
  - `'pt'`: Return PyTorch `torch.Tensor` objects.
165
  - `'np'`: Return NumPy `np.ndarray` objects.
166
  - `'jax'`: Return JAX `jnp.ndarray` objects.
167
- labels (`str`, `List[str]`, `List[List[str]]`):
168
- The label or batch of labels to be encoded. Only necessary for training.
169
- text (`str`, `List[str]`, `List[List[str]]`):
170
- The text or batch of text to be encoded. If provided, the prompt and label should be
171
 
172
  Returns:
173
  [`BatchFeature`]: A [`BatchFeature`] with the following fields:
174
 
175
- - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `label`
176
- is provided, the `input_ids` will also contain the label input ids.
177
  - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
178
  `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
179
  `None`).
180
  - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
181
- - **labels** -- Labels compatible with training if `label` is not None
182
  """
183
 
184
- # return_token_type_ids = True if labels is not None else False
185
- return_token_type_ids = True
186
 
187
  if images is None:
188
- raise ValueError("`images` are expected as arguments to a `TraVisionProcessor` instance.")
189
-
190
- images = [images] if not isinstance(images, list) else images
191
-
192
- if prompts is None:
193
  logger.warning_once(
194
- "You are using TaiVisionLM without a text prefix. It will perform as a picture-captioning model."
195
  )
196
- prompts = "描述這張圖片" # default prompt if it is not provided as an argument
197
- if len(images) != 1:
198
- prompts = [prompts] * len(images)
199
-
200
- if isinstance(prompts, List) and isinstance(images, List):
201
  if len(images) < len(text):
202
  raise ValueError(
203
- f"Received {len(images)} images for {len(prompts)} prompts. Each prompt should be associated with an image."
204
  )
205
- if _is_str_or_image(prompts):
206
- prompts = [prompts]
207
- elif isinstance(prompts, list) and _is_str_or_image(prompts[0]):
208
  pass
209
-
210
- # add \n after image tokens
211
- prompts = [f"\n<|user|>\n{prompt}{self.tokenizer.eos_token}\n" for prompt in prompts]
212
- # TODO: tokenize the prompt twice, and check if the prompt is too long
213
- prompt_length = [len(self.tokenizer.tokenize(prompt)) + self.image_seq_length for prompt in prompts]
214
-
215
-
216
- if labels is not None:
217
- if _is_str_or_image(labels):
218
- labels = [labels] # convert it to list if it is a string
219
- labels = [f"<|assistant|>\n{label}{self.tokenizer.eos_token}" for label in labels]
220
 
221
- text = [f"{prompt}{label}" for prompt, label in zip(prompts, labels)]
222
-
223
- else:
224
- text = prompts
225
-
226
- assert len(images) == len(text), "The number of images and text should be the same."
227
-
228
  input_strings = [
229
  build_string_from_input(
230
  image_seq_len=self.image_seq_length,
231
  image_token=IMAGE_TOKEN,
232
- )
233
- for _ in text
234
  ]
235
-
236
- # this will do some image processing, like resizing, normalizing, etc.
237
  pixel_values = self.image_processor(
238
  images,
239
  do_resize=do_resize,
@@ -250,10 +553,9 @@ class TaiVisionProcessor(ProcessorMixin):
250
  if max_length is not None:
251
  max_length += self.image_seq_length # max_length has to account for the image tokens
252
 
253
- # modify the token_type_ids here by text_pair ?
254
  inputs = self.tokenizer(
255
  input_strings,
256
- text_pair=labels,
257
  return_tensors=return_tensors,
258
  padding=padding,
259
  max_length=max_length,
@@ -263,59 +565,58 @@ class TaiVisionProcessor(ProcessorMixin):
263
 
264
  return_data = {**inputs, "pixel_values": pixel_values}
265
 
266
- # we are doing training, so we need to return the labels
267
- if labels is not None:
268
- # fill the labels with -100 where we don't have to compute the loss
269
- # mask the padding part
270
- labels = inputs["input_ids"].masked_fill(inputs["attention_mask"] == 0, -100)
271
- # mask the image + prompt part, so that we don't train the model to predict the image tokens
272
- import torch
273
- prompt_length_tensor = torch.tensor(prompt_length)
274
- labels = labels.masked_fill(torch.arange(labels.size(1)).unsqueeze(0) < prompt_length_tensor.unsqueeze(1), -100)
275
  return_data.update({"labels": labels})
276
-
277
  return BatchFeature(data=return_data)
278
 
279
- # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->GPT2
280
  def batch_decode(self, *args, **kwargs):
281
  """
282
- This method forwards all its arguments to GPT2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
283
  refer to the docstring of this method for more information.
284
  """
285
  return self.tokenizer.batch_decode(*args, **kwargs)
286
 
287
- # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->GPT2
288
  def decode(self, *args, **kwargs):
289
  """
290
- This method forwards all its arguments to GPT2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
291
  the docstring of this method for more information.
292
  """
293
  return self.tokenizer.decode(*args, **kwargs)
294
 
295
  @property
296
- # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->TraVision
297
  def model_input_names(self):
298
  tokenizer_input_names = self.tokenizer.model_input_names
299
  image_processor_input_names = self.image_processor.model_input_names
300
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
301
 
302
-
303
- # if __name__ == '__main__':
304
- # config = TaiVisionLMConfig.from_pretrained("./")
305
- # preprocessor = transformers.SiglipImageProcessor.from_pretrained("google/siglip-base-patch16-224")
306
- # preprocessor.image_seq_length = config.num_image_tokens
307
- # tokenizer = transformers.AutoTokenizer.from_pretrained("benchang1110/Taiwan-tinyllama-v1.0-chat")
308
- # processor = TaiVisionProcessor(tokenizer=tokenizer, image_processor=preprocessor)
309
- # processor.save_pretrained("./")
310
 
311
- # from PIL import Image
312
- # import requests
313
- # processor = TaiVisionProcessor.from_pretrained("./")
314
- # url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
315
- # image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
316
- # prompt = "Hello< what is your name?"
317
- # label = "I am fine, thank you."
318
- # inputs = processor(prompts=prompt, labels=label,images=image, return_tensors="pt",padding="max_length",max_length=512)
319
- # for key, value in inputs.items():
320
- # print(f"{key}: {value}")
321
- # print(processor.decode(inputs.input_ids.tolist()[0]))
1
+ # """
2
+ # Processor class for TaiVisionLM.
3
+ # """
4
+ # import transformers
5
+ # import logging
6
+ # from typing import List, Optional, Union
7
+
8
+ # from transformers.feature_extraction_utils import BatchFeature
9
+ # from transformers.image_utils import ImageInput, is_valid_image
10
+ # from transformers.processing_utils import ProcessorMixin
11
+ # from transformers.tokenization_utils import (
12
+ # AddedToken,
13
+ # PaddingStrategy,
14
+ # PreTokenizedInput,
15
+ # TextInput,
16
+ # TruncationStrategy,
17
+ # )
18
+ # from transformers.utils import TensorType
19
+ # from .configuration_taivisionlm import TaiVisionLMConfig
20
+
21
+ # logger = logging.getLogger(__name__)
22
+
23
+ # IMAGE_TOKEN = "<image>"
24
+
25
+ # # Copied from transformers.models.idefics2.processing_idefics2.is_url
26
+ # def is_url(val) -> bool:
27
+ # return isinstance(val, str) and val.startswith("http")
28
+
29
+
30
+ # # Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
31
+ # def is_image_or_image_url(elem):
32
+ # return is_url(elem) or is_valid_image(elem)
33
+
34
+ # # Copied from transformers.models.paligemma.processing_paligemma._is_str_or_image
35
+ # def _is_str_or_image(elem):
36
+ # return isinstance(elem, (str)) or is_image_or_image_url(elem)
37
+
38
+
39
+ # def build_string_from_input(image_seq_len, image_token):
40
+ # """
41
+ # Builds a string from the input prompt and image tokens.
42
+ # For example, for the call:
43
+ # build_string_from_input(
44
+ # image_seq_len=3,
45
+ # image_token="<im>",
46
+ # )
47
+ # The output will be:
48
+ # "<im><im><im>"
49
+ # Args:
50
+ # image_seq_len (`int`): The length of the image sequence.
51
+ # image_token (`str`): The image token.
52
+ # """
53
+ # return f"{image_token * image_seq_len}"
54
+
55
+
56
+ # class TaiVisionProcessor(ProcessorMixin):
57
+ # r"""
58
+ # Constructs a TraVision processor which wraps a SigLIP image processor and a GPT2 tokenizer into a single processor.
59
+
60
+ # [`TaiVisionProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`LlamaTokenizerFast`]. See the
61
+ # [`~TaiVisionProcessor.__call__`] and [`~TaiVisionProcessor.decode`] for more information.
62
+
63
+ # Args:
64
+ # image_processor ([`SiglipImageProcessor`], *optional*):
65
+ # The image processor is a required input.
66
+ # tokenizer ([`LlamaTokenizerFast`], *optional*):
67
+ # The tokenizer is a required input.
68
+ # chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
69
+ # in a chat into a tokenizable string.
70
+ # """
71
+
72
+ # attributes = ["image_processor", "tokenizer"]
73
+ # valid_kwargs = ["chat_template"]
74
+ # image_processor_class = "SiglipImageProcessor"
75
+ # tokenizer_class = ("LlamaTokenizer", "LlamaTokenizerFast")
76
+
77
+ # def __init__(
78
+ # self,
79
+ # image_processor=None,
80
+ # tokenizer=None,
81
+ # chat_template=None,
82
+ # **kwargs,
83
+ # ):
84
+ # if image_processor is None:
85
+ # raise ValueError("You need to specify an `image_processor`.")
86
+ # if tokenizer is None:
87
+ # raise ValueError("You need to specify a `tokenizer`.")
88
+ # if not hasattr(image_processor, "image_seq_length"):
89
+ # raise ValueError("Image processor is missing an `image_seq_length` attribute.")
90
+
91
+ # self.image_seq_length = image_processor.image_seq_length
92
+
93
+ # image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True)
94
+ # tokens_to_add = {"additional_special_tokens": [image_token]}
95
+ # tokenizer.add_special_tokens(tokens_to_add)
96
+ # self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
97
+ # tokenizer.add_bos_token = False
98
+ # tokenizer.add_eos_token = False
99
+
100
+ # super().__init__(image_processor, tokenizer, chat_template=chat_template)
101
+
102
+ # def __call__(
103
+ # self,
104
+ # prompts: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
105
+ # images: ImageInput = None,
106
+ # padding: Union[bool, str, PaddingStrategy] = False,
107
+ # truncation: Union[bool, str, TruncationStrategy] = None,
108
+ # max_length=None,
109
+ # return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
110
+ # do_resize: bool = None,
111
+ # do_normalize: bool = None,
112
+ # image_mean: Optional[Union[float, List[float]]] = None,
113
+ # image_std: Optional[Union[float, List[float]]] = None,
114
+ # data_format: Optional["ChannelDimension"] = "channels_first", # noqa: F821
115
+ # input_data_format: Optional[
116
+ # Union[str, "ChannelDimension"] # noqa: F821
117
+ # ] = None,
118
+ # resample: "PILImageResampling" = None, # noqa: F821
119
+ # do_convert_rgb: bool = None,
120
+ # do_thumbnail: bool = None,
121
+ # do_align_long_axis: bool = None,
122
+ # do_rescale: bool = None,
123
+ # labels: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
124
+ # ) -> BatchFeature:
125
+ # """
126
+ # Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
127
+ # and `kwargs` arguments to GPT2TokenizerFast's [`~GPT2TokenizerFast.__call__`] if `text` is not `None` to encode
128
+ # the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
129
+ # SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
130
+ # of the above two methods for more information.
131
+
132
+ # The usage for TraVisionLM fine-tuning preparation follows a standard 4D causal mask where only the prompt and label tokens
133
+ # are attended in an auto-regressive manner. The label in `text` are to be passed separately to the __call__ function and
134
+ # will be placed after the prompt, which is the instruction to steer the model generation.
135
+
136
+ # Args:
137
+ # prompts (`str`, `List[str]`, `List[List[str]]`):
138
+ # The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
139
+ # (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
140
+ # `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
141
+ # images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
142
+ # The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
143
+ # tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
144
+ # number of channels, H and W are \image height and width.
145
+ # tokenize_newline_separately (`bool`, defaults to `False`):
146
+ # Adds a separately tokenized '\n' at the end of the prompt.
147
+ # padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
148
+ # Select a strategy to pad the returned sequences (according to the model's padding side and padding
149
+ # index) among:
150
+ # - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
151
+ # sequence if provided).
152
+ # - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
153
+ # acceptable input length for the model if that argument is not provided.
154
+ # - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
155
+ # lengths).
156
+ # max_length (`int`, *optional*):
157
+ # Maximum length of the returned list and optionally padding length (see above).
158
+ # truncation (`bool`, *optional*):
159
+ # Activates truncation to cut input sequences longer than `max_length` to `max_length`.
160
+ # return_tensors (`str` or [`~utils.TensorType`], *optional*):
161
+ # If set, will return tensors of a particular framework. Acceptable values are:
162
+
163
+ # - `'tf'`: Return TensorFlow `tf.constant` objects.
164
+ # - `'pt'`: Return PyTorch `torch.Tensor` objects.
165
+ # - `'np'`: Return NumPy `np.ndarray` objects.
166
+ # - `'jax'`: Return JAX `jnp.ndarray` objects.
167
+ # labels (`str`, `List[str]`, `List[List[str]]`):
168
+ # The label or batch of labels to be encoded. Only necessary for training.
169
+ # text (`str`, `List[str]`, `List[List[str]]`):
170
+ # The text or batch of text to be encoded. If provided, the prompt and label should be
171
+
172
+ # Returns:
173
+ # [`BatchFeature`]: A [`BatchFeature`] with the following fields:
174
+
175
+ # - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `label`
176
+ # is provided, the `input_ids` will also contain the label input ids.
177
+ # - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
178
+ # `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
179
+ # `None`).
180
+ # - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
181
+ # - **labels** -- Labels compatible with training if `label` is not None
182
+ # """
183
+
184
+ # # return_token_type_ids = True if labels is not None else False
185
+ # return_token_type_ids = True
186
+
187
+ # if images is None:
188
+ # raise ValueError("`images` are expected as arguments to a `TraVisionProcessor` instance.")
189
+
190
+ # images = [images] if not isinstance(images, list) else images
191
+
192
+ # if prompts is None:
193
+ # logger.warning_once(
194
+ # "You are using TaiVisionLM without a text prefix. It will perform as a picture-captioning model."
195
+ # )
196
+ # prompts = "描述這張圖片" # default prompt if it is not provided as an argument
197
+ # if len(images) != 1:
198
+ # prompts = [prompts] * len(images)
199
+
200
+ # if isinstance(prompts, List) and isinstance(images, List):
201
+ # if len(images) < len(text):
202
+ # raise ValueError(
203
+ # f"Received {len(images)} images for {len(prompts)} prompts. Each prompt should be associated with an image."
204
+ # )
205
+ # if _is_str_or_image(prompts):
206
+ # prompts = [prompts]
207
+ # elif isinstance(prompts, list) and _is_str_or_image(prompts[0]):
208
+ # pass
209
+
210
+ # # add \n after image tokens
211
+ # prompts = [f"\n<|user|>\n{prompt}{self.tokenizer.eos_token}\n" for prompt in prompts]
212
+ # # TODO: tokenize the prompt twice, and check if the prompt is too long
213
+ # prompt_length = [len(self.tokenizer.tokenize(prompt)) + self.image_seq_length for prompt in prompts]
214
+
215
+
216
+ # if labels is not None:
217
+ # if _is_str_or_image(labels):
218
+ # labels = [labels] # convert it to list if it is a string
219
+ # labels = [f"<|assistant|>\n{label}{self.tokenizer.eos_token}" for label in labels]
220
+
221
+ # text = [f"{prompt}{label}" for prompt, label in zip(prompts, labels)]
222
+
223
+ # else:
224
+ # text = prompts
225
+
226
+ # assert len(images) == len(text), "The number of images and text should be the same."
227
+
228
+ # input_strings = [
229
+ # build_string_from_input(
230
+ # image_seq_len=self.image_seq_length,
231
+ # image_token=IMAGE_TOKEN,
232
+ # )
233
+ # for _ in text
234
+ # ]
235
+
236
+ # # this will do some image processing, like resizing, normalizing, etc.
237
+ # pixel_values = self.image_processor(
238
+ # images,
239
+ # do_resize=do_resize,
240
+ # do_normalize=do_normalize,
241
+ # return_tensors=return_tensors,
242
+ # image_mean=image_mean,
243
+ # image_std=image_std,
244
+ # input_data_format=input_data_format,
245
+ # data_format=data_format,
246
+ # resample=resample,
247
+ # do_convert_rgb=do_convert_rgb,
248
+ # )["pixel_values"]
249
+
250
+ # if max_length is not None:
251
+ # max_length += self.image_seq_length # max_length has to account for the image tokens
252
+
253
+ # # modify the token_type_ids here by text_pair ?
254
+ # inputs = self.tokenizer(
255
+ # input_strings,
256
+ # text_pair=labels,
257
+ # return_tensors=return_tensors,
258
+ # padding=padding,
259
+ # max_length=max_length,
260
+ # truncation=truncation,
261
+ # return_token_type_ids=return_token_type_ids,
262
+ # )
263
+
264
+ # return_data = {**inputs, "pixel_values": pixel_values}
265
+
266
+ # # we are doing training, so we need to return the labels
267
+ # if labels is not None:
268
+ # # fill the labels with -100 where we don't have to compute the loss
269
+ # # mask the padding part
270
+ # labels = inputs["input_ids"].masked_fill(inputs["attention_mask"] == 0, -100)
271
+ # # mask the image + prompt part, so that we don't train the model to predict the image tokens
272
+ # import torch
273
+ # prompt_length_tensor = torch.tensor(prompt_length)
274
+ # labels = labels.masked_fill(torch.arange(labels.size(1)).unsqueeze(0) < prompt_length_tensor.unsqueeze(1), -100)
275
+ # return_data.update({"labels": labels})
276
+
277
+ # return BatchFeature(data=return_data)
278
+
279
+ # # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->GPT2
280
+ # def batch_decode(self, *args, **kwargs):
281
+ # """
282
+ # This method forwards all its arguments to GPT2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
283
+ # refer to the docstring of this method for more information.
284
+ # """
285
+ # return self.tokenizer.batch_decode(*args, **kwargs)
286
+
287
+ # # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->GPT2
288
+ # def decode(self, *args, **kwargs):
289
+ # """
290
+ # This method forwards all its arguments to GPT2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
291
+ # the docstring of this method for more information.
292
+ # """
293
+ # return self.tokenizer.decode(*args, **kwargs)
294
+
295
+ # @property
296
+ # # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->TraVision
297
+ # def model_input_names(self):
298
+ # tokenizer_input_names = self.tokenizer.model_input_names
299
+ # image_processor_input_names = self.image_processor.model_input_names
300
+ # return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
301
+
302
+
303
+
304
+
305
+
306
+
307
  import logging
308
  from typing import List, Optional, Union
309
 
310
  from transformers.feature_extraction_utils import BatchFeature
311
  from transformers.image_utils import ImageInput, is_valid_image
312
  from transformers.processing_utils import ProcessorMixin
313
+ from transformers.tokenization_utils_base import (
314
  AddedToken,
315
  PaddingStrategy,
316
  PreTokenizedInput,
317
  TextInput,
318
+ TruncationStrategy,
319
  )
320
  from transformers.utils import TensorType
321
+
322
 
323
  logger = logging.getLogger(__name__)
324
 
325
  IMAGE_TOKEN = "<image>"
326
 
327
+
328
  # Copied from transformers.models.idefics2.processing_idefics2.is_url
329
  def is_url(val) -> bool:
330
  return isinstance(val, str) and val.startswith("http")
 
334
  def is_image_or_image_url(elem):
335
  return is_url(elem) or is_valid_image(elem)
336
 
337
+
338
  def _is_str_or_image(elem):
339
  return isinstance(elem, (str)) or is_image_or_image_url(elem)
340
 
341
 
342
+ def build_string_from_input(prompt, bos_token, image_seq_len, image_token):
343
  """
344
  Builds a string from the input prompt and image tokens.
345
  For example, for the call:
346
  build_string_from_input(
347
+ prompt="Prefix str"
348
+ bos_token="<s>",
349
  image_seq_len=3,
350
  image_token="<im>",
351
  )
352
  The output will be:
353
+ "<im><im><im><s>Initial str"
354
  Args:
355
+ prompt (`List[Union[str, ImageInput]]`): The input prompt.
356
+ bos_token (`str`): The beginning of sentence token.
357
  image_seq_len (`int`): The length of the image sequence.
358
  image_token (`str`): The image token.
359
  """
360
+ return f"{image_token * image_seq_len}{bos_token}{prompt}\n"
361
 
362
 
363
  class TaiVisionProcessor(ProcessorMixin):
364
  r"""
365
+ Constructs a PaliGemma processor which wraps a PaliGemma image processor and a PaliGemma tokenizer into a single processor.
366
 
367
+ [`PaliGemmaProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`LlamaTokenizerFast`]. See the
368
+ [`~PaliGemmaProcessor.__call__`] and [`~PaliGemmaProcessor.decode`] for more information.
369
 
370
  Args:
371
  image_processor ([`SiglipImageProcessor`], *optional*):
 
405
  tokenizer.add_eos_token = False
406
 
407
  super().__init__(image_processor, tokenizer, chat_template=chat_template)
408
+
409
  def __call__(
410
  self,
411
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
412
  images: ImageInput = None,
413
+ tokenize_newline_separately: bool = True,
414
  padding: Union[bool, str, PaddingStrategy] = False,
415
  truncation: Union[bool, str, TruncationStrategy] = None,
416
  max_length=None,
 
428
  do_thumbnail: bool = None,
429
  do_align_long_axis: bool = None,
430
  do_rescale: bool = None,
431
+ suffix: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
432
  ) -> BatchFeature:
433
  """
434
  Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
435
+ and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
436
  the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
437
  SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
438
  of the above two methods for more information.
439
 
440
+ The usage for PaliGemma fine-tuning preparation is slightly different than usual. suffix passed are suffixes to
441
+ the prompt in `text`, and will be placed after the prompt. This is because attention is handled differently for
442
+ the prefix and the suffix. For instance,
443
+ ```python
444
+ image = PIL_cow_image
445
+ prompt = "answer en Where is the cow standing?"
446
+ suffix = "on the beach"
447
+ inputs = processor(text=prompt, images=image, suffix=suffix)
448
+ ```
449
+ Here `inputs` will contain the `input_ids` and `token_type_ids` that follow
450
+ ```python
451
+ inputs["input_ids"][:, 256:]
452
+ # tensor([[ 2, 6006, 603, 573, 13910, 9980, 235336, 108, 477, 573, 8318]])
453
+ inputs["token_type_ids"][:, 256:]
454
+ tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]])
455
+ ```
456
+ Meaning the last three tokens are of "label" ("suffix") type while the other ones are of "prefix" type.
457
+
458
 
459
  Args:
460
+ text (`str`, `List[str]`, `List[List[str]]`):
461
  The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
462
  (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
463
  `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
464
  images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
465
  The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
466
  tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
467
+ number of channels, H and W are image height and width.
468
+ tokenize_newline_separately (`bool`, defaults to `True`):
469
+ Adds a separately tokenized '\n' at the end of the prompt.
470
  padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
471
  Select a strategy to pad the returned sequences (according to the model's padding side and padding
472
  index) among:
 
487
  - `'pt'`: Return PyTorch `torch.Tensor` objects.
488
  - `'np'`: Return NumPy `np.ndarray` objects.
489
  - `'jax'`: Return JAX `jnp.ndarray` objects.
490
+ suffix (`str`, `List[str]`, `List[List[str]]`):
491
+ The suffixes or batch of suffixes to be encoded. Only necessary for finetuning. See https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md
492
+ for more information. If your prompt is "<image> What is on the image", the suffix corresponds to the expected prediction "a cow sitting on a bench".
 
493
 
494
  Returns:
495
  [`BatchFeature`]: A [`BatchFeature`] with the following fields:
496
 
497
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
498
+ is provided, the `input_ids` will also contain the suffix input ids.
499
  - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
500
  `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
501
  `None`).
502
  - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
503
+ - **labels** -- Labels compatible with training if `suffix` is not None
504
  """
505
 
506
+ return_token_type_ids = True if suffix is not None else False
 
507
 
508
  if images is None:
509
+ raise ValueError("`images` are expected as arguments to a `PaliGemmaProcessor` instance.")
510
+ if text is None:
511
  logger.warning_once(
512
+ "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
513
  )
514
+ text = ""
515
+
516
+ if isinstance(text, List) and isinstance(images, List):
517
  if len(images) < len(text):
518
  raise ValueError(
519
+ f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image."
520
  )
521
+ if _is_str_or_image(text):
522
+ text = [text]
523
+ elif isinstance(text, list) and _is_str_or_image(text[0]):
524
  pass
525
+ if suffix is not None and _is_str_or_image(suffix):
526
+ suffix = [suffix]
527
+ if suffix is not None:
528
+ suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
529
 
530
  input_strings = [
531
  build_string_from_input(
532
+ prompt=prompt,
533
+ bos_token=self.tokenizer.bos_token,
534
  image_seq_len=self.image_seq_length,
535
  image_token=IMAGE_TOKEN,
536
+ )
537
+ for prompt in text
538
  ]
539
+
 
540
  pixel_values = self.image_processor(
541
  images,
542
  do_resize=do_resize,
 
553
  if max_length is not None:
554
  max_length += self.image_seq_length # max_length has to account for the image tokens
555
 
 
556
  inputs = self.tokenizer(
557
  input_strings,
558
+ text_pair=suffix,
559
  return_tensors=return_tensors,
560
  padding=padding,
561
  max_length=max_length,
 
565
 
566
  return_data = {**inputs, "pixel_values": pixel_values}
567
 
568
+ if return_token_type_ids:
569
+ labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
570
  return_data.update({"labels": labels})
 
571
  return BatchFeature(data=return_data)
572
 
573
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
574
  def batch_decode(self, *args, **kwargs):
575
  """
576
+ This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
577
  refer to the docstring of this method for more information.
578
  """
579
  return self.tokenizer.batch_decode(*args, **kwargs)
580
 
581
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.decode with CLIP->Gemma
582
  def decode(self, *args, **kwargs):
583
  """
584
+ This method forwards all its arguments to GemmaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
585
  the docstring of this method for more information.
586
  """
587
  return self.tokenizer.decode(*args, **kwargs)
588
 
589
  @property
590
+ # Copied from transformers.models.clip.processing_clip.CLIPProcessor.model_input_names with CLIP->PaliGemma
591
  def model_input_names(self):
592
  tokenizer_input_names = self.tokenizer.model_input_names
593
  image_processor_input_names = self.image_processor.model_input_names
594
  return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
595
596
 
597
+
598
+ if __name__ == '__main__':
599
+ from configuration_taivisionlm import TaiVisionLMConfig
600
+ import transformers
601
+ import torch
602
+ config = TaiVisionLMConfig.from_pretrained("./")
603
+ preprocessor = transformers.SiglipImageProcessor.from_pretrained("google/siglip-base-patch16-224")
604
+ preprocessor.image_seq_length = config.num_image_tokens
605
+ tokenizer = transformers.AutoTokenizer.from_pretrained("benchang1110/Taiwan-tinyllama-v1.0-chat")
606
+ processor = TaiVisionProcessor(tokenizer=tokenizer, image_processor=preprocessor)
607
+ processor.save_pretrained("./")
608
+
609
+ from PIL import Image
610
+ import requests
611
+ processor = TaiVisionProcessor.from_pretrained("./")
612
+ url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/car.jpg"
613
+ image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
614
+ text = "Hello< what is your name?"
615
+ suffix = "I am fine, thank you."
616
+ inputs = processor(text=text,suffix=suffix,images=image, return_tensors="pt",padding="max_length",max_length=512)
617
+ print(inputs['attention_mask'].shape)
618
+ print(inputs['input_ids'].shape)
619
+ print(inputs['token_type_ids'].shape)
620
+ # print number of 0 in token_type_ids
621
+ print(torch.sum(inputs['token_type_ids']==0))
622
+ print(inputs)
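
The key behavioural change in the new `__call__` is that training labels are now built directly from `token_type_ids` instead of from a separately tracked prompt length: every position whose type id is 0 (image tokens plus the prefix) is masked to -100, so the loss is only computed on the suffix. A minimal sketch of that rule, reusing the abridged token ids from the docstring example above (the short sequence is illustrative):

```python
import torch

# prefix tokens carry token_type_ids == 0, suffix ("label") tokens carry 1
input_ids      = torch.tensor([[2, 6006, 603, 573, 477, 573, 8318]])
token_type_ids = torch.tensor([[0,    0,   0,   0,   1,   1,    1]])

# same rule as: labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
labels = input_ids.masked_fill(token_type_ids == 0, -100)
print(labels)  # tensor([[-100, -100, -100, -100,  477,  573, 8318]])
```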