Fix to handle None for image features (guard `image_features is None` in the cached-generation path)
Browse files- modeling_llava.py +3 -3
modeling_llava.py
CHANGED
@@ -1661,7 +1661,7 @@ class LlavaForCausalLM(LlavaPreTrainedModel):
|
|
1661 |
else:
|
1662 |
# In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
|
1663 |
# generation with cache
|
1664 |
-
if past_key_values is not None and
|
1665 |
# Retrieve the first layer to inspect the logits and mask out the hidden states
|
1666 |
# that are set to 0
|
1667 |
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
|
@@ -1734,6 +1734,7 @@ class LlavaForCausalLM(LlavaPreTrainedModel):
|
|
1734 |
else:
|
1735 |
cache_length = past_length = past_key_values[0][0].shape[2]
|
1736 |
max_cache_length = None
|
|
|
1737 |
|
1738 |
# Keep only the unprocessed tokens:
|
1739 |
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
|
@@ -1746,8 +1747,7 @@ class LlavaForCausalLM(LlavaPreTrainedModel):
|
|
1746 |
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
|
1747 |
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
|
1748 |
# input_ids based on the past_length.
|
1749 |
-
elif past_length < input_ids.shape[1]:
|
1750 |
-
past_length -= image_features.shape[1]-1
|
1751 |
input_ids = input_ids[:, past_length:]
|
1752 |
attention_mask = attention_mask[:, past_length:]
|
1753 |
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
|
|
|
1661 |
else:
|
1662 |
# In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
|
1663 |
# generation with cache
|
1664 |
+
if past_key_values is not None and input_ids.shape[1] == 1:
|
1665 |
# Retrieve the first layer to inspect the logits and mask out the hidden states
|
1666 |
# that are set to 0
|
1667 |
first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
|
|
|
1734 |
else:
|
1735 |
cache_length = past_length = past_key_values[0][0].shape[2]
|
1736 |
max_cache_length = None
|
1737 |
+
past_length -= image_features.shape[1]-1 if image_features is not None else 0
|
1738 |
|
1739 |
# Keep only the unprocessed tokens:
|
1740 |
# 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
|
|
|
1747 |
input_ids = input_ids[:, -(attention_mask.shape[1] - past_length) :]
|
1748 |
# 2 - If the past_length is smaller than input_ids', then input_ids holds all input tokens. We can discard
|
1749 |
# input_ids based on the past_length.
|
1750 |
+
elif past_length < input_ids.shape[1]:
|
|
|
1751 |
input_ids = input_ids[:, past_length:]
|
1752 |
attention_mask = attention_mask[:, past_length:]
|
1753 |
# 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
|