pawlowskipawel commited on
Commit
b93e004
1 Parent(s): 43425cd

update generate function

Browse files
Files changed (1) hide show
  1. modeling_florence2.py +11 -5
modeling_florence2.py CHANGED
@@ -2655,7 +2655,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2655
  return image_features, image_attention_mask
2656
 
2657
  task_prefix_embeds = inputs_embeds
2658
-
2659
  if task_prefix_attention_mask is None:
2660
  task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
2661
 
@@ -2721,12 +2721,14 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2721
  >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
2722
  "A green car parked in front of a yellow building."
2723
  ```"""
 
 
2724
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
2725
  output_hidden_states = (
2726
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
2727
  )
2728
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2729
-
2730
  image_features = None
2731
  if inputs_embeds is None:
2732
  # 1. Extra the input embeddings
@@ -2736,8 +2738,10 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2736
  if pixel_values is not None:
2737
  # (batch_size, num_image_tokens, hidden_size)
2738
  image_features = self._encode_image(pixel_values)
2739
- inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds, task_prefix_attention_mask=attention_mask)
2740
-
 
 
2741
  if inputs_embeds is not None:
2742
  attention_mask = attention_mask.to(inputs_embeds.dtype)
2743
  outputs = self.language_model(
@@ -2783,6 +2787,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2783
  input_ids,
2784
  inputs_embeds=None,
2785
  pixel_values=None,
 
2786
  **kwargs
2787
  ):
2788
 
@@ -2793,11 +2798,12 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel):
2793
  # 2. Merge text and images
2794
  if pixel_values is not None:
2795
  image_features = self._encode_image(pixel_values)
2796
- inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
2797
 
2798
  return self.language_model.generate(
2799
  input_ids=None,
2800
  inputs_embeds=inputs_embeds,
 
2801
  **kwargs
2802
  )
2803
 
 
2655
  return image_features, image_attention_mask
2656
 
2657
  task_prefix_embeds = inputs_embeds
2658
+
2659
  if task_prefix_attention_mask is None:
2660
  task_prefix_attention_mask = torch.ones(batch_size, task_prefix_embeds.size(1), device=device)
2661
 
 
2721
  >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
2722
  "A green car parked in front of a yellow building."
2723
  ```"""
2724
+ print("asdasdasdasdasdasdasdasda")
2725
+
2726
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
2727
  output_hidden_states = (
2728
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
2729
  )
2730
  return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2731
+ print("asdasdasdasdasdasdasdasda")
2732
  image_features = None
2733
  if inputs_embeds is None:
2734
  # 1. Extra the input embeddings
 
2738
  if pixel_values is not None:
2739
  # (batch_size, num_image_tokens, hidden_size)
2740
  image_features = self._encode_image(pixel_values)
2741
+ inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds)
2742
+
2743
+ print(attention_mask)
2744
+
2745
  if inputs_embeds is not None:
2746
  attention_mask = attention_mask.to(inputs_embeds.dtype)
2747
  outputs = self.language_model(
 
2787
  input_ids,
2788
  inputs_embeds=None,
2789
  pixel_values=None,
2790
+ attention_mask=None,
2791
  **kwargs
2792
  ):
2793
 
 
2798
  # 2. Merge text and images
2799
  if pixel_values is not None:
2800
  image_features = self._encode_image(pixel_values)
2801
+ inputs_embeds, attention_mask = self._merge_input_ids_with_image_features(image_features, inputs_embeds, task_prefix_attention_mask=attention_mask)
2802
 
2803
  return self.language_model.generate(
2804
  input_ids=None,
2805
  inputs_embeds=inputs_embeds,
2806
+ attention_mask=attention_mask,
2807
  **kwargs
2808
  )
2809