Update modeling_Llamoe.py
modeling_Llamoe.py  +2 -0  CHANGED
@@ -589,9 +589,11 @@ class LlamoeSdpaAttention(LlamoeAttention):
         print("after_rb_value_states:",value_states)

         causal_mask = attention_mask
+        print("causal_mask:",causal_mask)
         if attention_mask is not None and cache_position is not None:
             causal_mask = causal_mask[:, :, cache_position, : key_states.shape[-2]]

+        print("after_causal_masks:",causal_mask)
         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
         if query_states.device.type == "cuda" and causal_mask is not None:
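The two added prints bracket the causal-mask handling in LlamoeSdpaAttention, logging the mask before and after it is sliced down to the current cache positions and cached key length. For context, here is a minimal standalone sketch of what that slicing does, assuming the 4-D mask layout (batch, 1, query_len, max_cache_len) and a cache_position index tensor as used by Hugging Face SDPA attention; all shapes and names are illustrative, not taken from the model file.

```python
# Illustrative sketch only: shows the effect of the sliced line in the diff
# on dummy tensors, not the model's actual forward pass.
import torch

batch, q_len, kv_len, max_cache_len = 2, 1, 12, 32

# Full causal mask padded to the cache length: 0 where attention is allowed,
# a large negative value where it is masked out (assumed layout).
attention_mask = torch.full(
    (batch, 1, q_len, max_cache_len), torch.finfo(torch.float32).min
)
attention_mask[..., :kv_len] = 0.0

# cache_position holds the absolute position(s) of the current query token(s).
cache_position = torch.tensor([11])
# (batch, num_heads, kv_len, head_dim) -- only shape[-2] matters here.
key_states = torch.randn(batch, 8, kv_len, 64)

causal_mask = attention_mask
print("causal_mask:", causal_mask.shape)        # torch.Size([2, 1, 1, 32])

if attention_mask is not None and cache_position is not None:
    # Select the rows for the current positions and trim the key axis to the
    # number of cached keys, mirroring the line printed in the diff.
    causal_mask = causal_mask[:, :, cache_position, : key_states.shape[-2]]

print("after_causal_masks:", causal_mask.shape)  # torch.Size([2, 1, 1, 12])
```

The subsequent `if query_states.device.type == "cuda" and causal_mask is not None:` branch is the standard workaround for the torch 2.1.2 memory-efficient SDPA bug referenced in the comment (pytorch issue 112577), which typically forces the query/key/value tensors to be contiguous before calling scaled_dot_product_attention; its body is not shown in this hunk.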