RuntimeError when running the demo code #10
opened by sdascoli
Hi! I'm trying to run the example script from the docs, but I hit a shape mismatch error (full traceback below).
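For context, what I'm running is essentially the image + text example from the model card (reconstructed below, so the exact prompt and arguments may differ slightly; `example.jpg` is a placeholder):

```python
# Rough sketch of the documented demo call (not verbatim from the docs).
import torch
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model_path = "microsoft/Phi-4-multimodal-instruct"
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True, torch_dtype=torch.bfloat16, device_map="cuda"
)

prompt = "<|user|><|image_1|>Describe this image.<|end|><|assistant|>"
image = Image.open("example.jpg")  # placeholder input image

inputs = processor(text=prompt, images=image, return_tensors="pt").to("cuda")
out = model.generate(**inputs, max_new_tokens=64)  # raises the error below
print(processor.batch_decode(out[:, inputs["input_ids"].shape[1]:], skip_special_tokens=True)[0])
```

Full traceback: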
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:2116, in Phi4MMForCausalLM.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, input_image_embeds, image_sizes, image_attention_mask, input_audio_embeds, audio_embed_sizes, audio_attention_mask, input_mode, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, num_logits_to_keep)
2113 raise ValueError(f"Invalid input_mode: {input_mode}")
2115 # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-> 2116 outputs = self.model(
2117 input_ids=input_ids,
2118 attention_mask=attention_mask,
2119 position_ids=position_ids,
2120 past_key_values=past_key_values,
2121 inputs_embeds=inputs_embeds,
2122 input_image_embeds=input_image_embeds,
2123 image_sizes=image_sizes,
2124 image_attention_mask=image_attention_mask,
2125 input_audio_embeds=input_audio_embeds,
2126 audio_embed_sizes=audio_embed_sizes,
2127 audio_attention_mask=audio_attention_mask,
2128 audio_projection_mode=audio_projection_mode,
2129 use_cache=use_cache,
2130 output_attentions=output_attentions,
2131 output_hidden_states=output_hidden_states,
2132 return_dict=return_dict,
2133 )
2135 hidden_states = outputs[0]
2136 # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:1707, in Phi4MMModel.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, input_image_embeds, image_sizes, image_attention_mask, input_audio_embeds, audio_embed_sizes, audio_attention_mask, audio_projection_mode, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, **kwargs)
1700 logger.warning_once(
1701 "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
1702 "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
1703 "(https://huggingface.co./docs/transformers/kv_cache#legacy-cache-format)"
1704 )
1706 if inputs_embeds is None:
-> 1707 inputs_embeds = self.embed_tokens_extend(
1708 input_ids=input_ids,
1709 input_embeds=inputs_embeds,
1710 input_image_embeds=input_image_embeds,
1711 input_audio_embeds=input_audio_embeds,
1712 image_sizes=image_sizes,
1713 image_attention_mask=image_attention_mask,
1714 audio_embed_sizes=audio_embed_sizes,
1715 audio_attention_mask=audio_attention_mask,
1716 audio_projection_mode=audio_projection_mode,
1717 wte=self.embed_tokens,
1718 )
1720 if cache_position is None:
1721 past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:769, in Phi4MMImageAudioEmbedding.forward(self, input_ids, input_embeds, input_image_embeds, input_audio_embeds, image_sizes, image_attention_mask, audio_embed_sizes, audio_attention_mask, audio_projection_mode, wte)
766 assert input_image_embeds is not None or input_audio_embeds is not None
768 if input_image_embeds is not None:
--> 769 image_hidden_states = self.image_embed(
770 input_ids=input_ids,
771 input_embeds=input_image_embeds,
772 image_sizes=image_sizes,
773 wte=wte,
774 image_attention_mask=image_attention_mask
775 )
776 if input_audio_embeds is not None:
777 audio_hidden_states = self.audio_embed(
778 input_ids=input_ids,
779 input_embeds=input_audio_embeds,
(...)
783 audio_projection_mode=audio_projection_mode,
784 )
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:328, in Phi4MMImageEmbedding.forward(self, input_ids, input_embeds, image_sizes, **kwargs)
326 # Nx(HW)xC
327 if image_attention_mask is not None and len(image_attention_mask) > 0:
--> 328 img_features = self.get_img_features(img_embeds.flatten(0, 1), attention_mask=image_attention_mask.type(torch.BoolTensor).flatten(0,1).to(target_device))
329 else:
330 img_features = self.get_img_features(img_embeds.flatten(0, 1))
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/modeling_phi4mm.py:194, in Phi4MMImageEmbedding.get_img_features(self, img_embeds, attention_mask)
192 else:
193 if attention_mask is not None:
--> 194 img_processor_output = self.img_processor(img_embeds, output_hidden_states=True, patch_attention_mask=attention_mask)
195 else:
196 img_processor_output = self.img_processor(img_embeds, output_hidden_states=True)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/vision_siglip_navit.py:1370, in SiglipVisionTransformer.forward(self, pixel_values, patch_attention_mask, output_attentions, output_hidden_states, return_dict)
1359 if patch_attention_mask is None:
1360 patch_attention_mask = torch.ones(
1361 size=(
1362 batch_size,
(...)
1367 device=pixel_values.device,
1368 )
-> 1370 hidden_states = self.embeddings(pixel_values=pixel_values, patch_attention_mask=patch_attention_mask)
1372 patch_attention_mask = patch_attention_mask.view(batch_size, -1)
1373 # The call to `_upad_input` in `_flash_attention_forward` is expensive
1374 # So when the `patch_attention_mask` is full of 1s (i.e. attending to the whole sequence),
1375 # avoiding passing the attention_mask, which is equivalent to attending to the full sequence
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/micromamba/envs/neuralset/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/.cache/huggingface/modules/transformers_modules/microsoft/Phi-4-multimodal-instruct/879783f7b23e43c12d1c682e3458f115f3a7718d/vision_siglip_navit.py:599, in SiglipVisionEmbeddings.forward(self, pixel_values, patch_attention_mask)
596 bucket_coords_w = torch.bucketize(fractional_coords_w, boundaries, right=True)
598 pos_ids = (bucket_coords_h[:, None] * self.num_patches_per_side + bucket_coords_w).flatten()
--> 599 position_ids[batch_idx][p_attn_mask.view(-1).cpu()] = pos_ids
601 position_ids = position_ids.to(self.position_embedding.weight.device)
603 embeddings = embeddings + self.position_embedding(position_ids)
RuntimeError: shape mismatch: value tensor of shape [1024] cannot be broadcast to indexing result of shape [992]
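If I'm reading `vision_siglip_navit.py` correctly, the failure is in the NaViT-style position-id assignment: `pos_ids` is computed for the full patch grid (1024 = 32 x 32 patches), while the boolean `p_attn_mask` marks only 992 patches as attended, so the advanced-indexing assignment at line 599 cannot broadcast. Here is a minimal standalone reproduction of just that assignment, with the shapes taken from the error message (masking out one grid row is my guess at where the 992 comes from):

```python
import torch

num_patches = 1024                    # full 32 x 32 patch grid
position_ids = torch.zeros(num_patches, dtype=torch.long)

p_attn_mask = torch.ones(num_patches, dtype=torch.bool)
p_attn_mask[-32:] = False             # only 992 patches attended (e.g. one grid row padded out)

pos_ids = torch.arange(num_patches)   # computed for the FULL grid
position_ids[p_attn_mask] = pos_ids   # RuntimeError: value tensor of shape [1024]
                                      # cannot be broadcast to indexing result of shape [992]
```

So the `image_attention_mask` coming out of the processor disagrees with the patch grid the vision tower reconstructs, which makes me suspect a preprocessing or version mismatch rather than a bug in the position-id maths itself.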
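In case it helps with triage, here is how I'm checking my environment against the package versions listed in the model card (the remote code seems sensitive to the transformers release, so a version mismatch is my current suspicion, not a confirmed cause):

```python
import torch
import transformers

# Compare these against the versions pinned in the model card.
print("transformers:", transformers.__version__)
print("torch:", torch.__version__)
print("cuda available:", torch.cuda.is_available())
```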
sdascoli changed discussion status to closed