RuntimeError when running the demo code #10
opened by sdascoli
Hi! I'm trying to run the example script from the docs, but I hit a shape mismatch error (full traceback below).
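For context, what I'm running is essentially the image + text example from the model card (reconstructed below, so the exact prompt and arguments may differ slightly; `example.jpg` is a placeholder):

```python
# Rough sketch of the documented demo call (not verbatim from the docs).
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model_path = "microsoft/Phi-4-multimodal-instruct"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda"
)

prompt = "<|user|><|image_1|>Describe this image.<|end|><|assistant|>"
image = Image.open("example.jpg")  # placeholder input image

inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
out = model.generate(**inputs, max_new_tokens=64)  # raises the error below
print(processor.batch_decode(out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0])
```

Full traceback: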
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:2116, in Phi4MMForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, input_image_embeds, image_sizes, image_attention_mask, input_audio_embeds, audio_embed_sizes, audio_attention_mask, input_mode, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, num_logits_to_keep)
2113 raise ValueError(f"Invalid input_mode: {input_mode}")
2115 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 2116 outputs = self.model(
2117 input_ids=input_ids,
2118 attention_mask=attention_mask,
2119 position_ids=position_ids,
2120 past_key_values=past_key_values,
2121 inputs_embeds=inputs_embeds,
2122 input_image_embeds=input_image_embeds,
2123 image_sizes=image_sizes,
2124 image_attention_mask=image_attention_mask,
2125 input_audio_embeds=input_audio_embeds,
2126 audio_embed_sizes=audio_embed_sizes,
2127 audio_attention_mask=audio_attention_mask,
2128 audio_projection_mode=audio_projection_mode,
2129 use_cache=use_cache,
2130 output_attentions=output_attentions,
2131 output_hidden_states=output_hidden_states,
2132 return_dict=return_dict,
2133 )
2135 hidden_states = outputs[0]
2136 # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:1707, in Phi4MMModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, input_image_embeds, image_sizes, image_attention_mask, input_audio_embeds, audio_embed_sizes, audio_attention_mask, audio_projection_mode, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, **kwargs)
1700 logger.warning_once(
1701 "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
1702 "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
1703 "(https://huggingface.co./docs/transformers/kv_cache#legacy-cache-format)"
1704 )
1706 if inputs_embeds is None:
-> 1707 inputs_embeds = self.embed_tokens_extend(
1708 input_ids=input_ids,
1709 input_embeds=inputs_embeds,
1710 input_image_embeds=input_image_embeds,
1711 input_audio_embeds=input_audio_embeds,
1712 image_sizes=image_sizes,
1713 image_attention_mask=image_attention_mask,
1714 audio_embed_sizes=audio_embed_sizes,
1715 audio_attention_mask=audio_attention_mask,
1716 audio_projection_mode=audio_projection_mode,
1717 wte=self.embed_tokens,
1718 )
1720 if cache_position is None:
1721 past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:769, in Phi4MMImageAudioEmbedding.forward(self, input_ids, input_embeds, input_image_embeds, input_audio_embeds, image_sizes, image_attention_mask, audio_embed_sizes, audio_attention_mask, audio_projection_mode, wte)
766 assert input_image_embeds is not None or input_audio_embeds is not None
768 if input_image_embeds is not None:
--> 769 image_hidden_states = self.image_embed(
770 input_ids=input_ids,
771 input_embeds=input_image_embeds,
772 image_sizes=image_sizes,
773 wte=wte,
774 image_attention_mask=image_attention_mask
775 )
776 if input_audio_embeds is not None:
777 audio_hidden_states = self.audio_embed(
778 input_ids=input_ids,
779 input_embeds=input_audio_embeds,
(...)
783 audio_projection_mode=audio_projection_mode,
784 )
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:328, in Phi4MMImageEmbedding.forward(self, input_ids, input_embeds, image_sizes, **kwargs)
326 # Nx(HW)xC
327 if image_attention_mask is not None and len(image_attention_mask) > 0:
--> 328 img_features = self.get_img_features(img_embeds.flatten(0, 1), attention_mask=image_attention_mask.type(torch.BoolTensor).flatten(0,1).to(target_device))
329 else:
330 img_features = self.get_img_features(img_embeds.flatten(0, 1))
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:194, in Phi4MMImageEmbedding.get_img_features(self, img_embeds, attention_mask)
192 else:
193 if attention_mask is not None:
--> 194 img_processor_output = self.img_processor(img_embeds, output_hidden_states=True, patch_attention_mask=attention_mask)
195 else:
196 img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/vision_siglip_navit.py:1370, in SiglipVisionTransformer.forward(self, pixel_values, patch_attention_mask, output_attentions, output_hidden_states, return_dict)
1359 if patch_attention_mask is None:
1360 patch_attention_mask = torch.ones(
1361 size=(
1362 batch_size,
(...)
1367 device=pixel_values.device,
1368 )
-> 1370 hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
1372 patch_attention_mask = patch_attention_mask.view(batch_size, -1)
1373 # The call to `_upad_input` in `_flash_attention_forward` is expensive
1374 # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
1375 # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/vision_siglip_navit.py:599, in SiglipVisionEmbeddings.forward(self, pixel_values, patch_attention_mask)
596 bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
598 pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
--> 599 position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
601 position_ids = position_ids.to(self.position_embedding.weight.device)
603 embeddings = embeddings + self.position_embedding(position_ids)
RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [992]
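If I'm reading `vision_siglip_navit.py` correctly, the failure is in the NaViT-style position-id assignment: `pos_ids` is computed for the full patch grid (1024 = 32 x 32 patches), while the boolean `p_attn_mask` marks only 992 patches as attended, so the advanced-indexing assignment at line 599 cannot broadcast. Here is a minimal standalone reproduction of just that assignment, with the shapes taken from the error message (masking out one grid row is my guess at where the 992 comes from):

```python
import torch

num_patches = 1024                    # full 32 x 32 patch grid
position_ids = torch.zeros(num_patches, dtype=torch.long)

p_attn_mask = torch.ones(num_patches, dtype=torch.bool)
p_attn_mask[-32:] = False             # only 992 patches attended (e.g. one grid row padded out)

pos_ids = torch.arange(num_patches)   # computed for the FULL grid
position_ids[p_attn_mask] = pos_ids   # RuntimeError: value tensor of shape [1024]
                                      # cannot be broadcast to indexing result of shape [992]
```

So the `image_attention_mask` coming out of the processor disagrees with the patch grid the vision tower reconstructs, which makes me suspect a preprocessing or version mismatch rather than a bug in the position-id maths itself.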
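In case it helps with triage, here is how I'm checking my environment against the package versions listed in the model card (the remote code seems sensitive to the transformers release, so a version mismatch is my current suspicion, not a confirmed cause):

```python
import torch
import transformers

# Compare these against the versions pinned in the model card.
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
```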
sdascoli changed discussion status to closed