Update modeling_Llamoe.py
modeling_Llamoe.py  +2 -0  CHANGED
@@ -589,9 +589,11 @@ class LlamoeSdpaAttention(LlamoeAttention):
         print("after_rb_value_states:",value_states)

         causal_mask = attention_mask
+        print("causal_mask:",causal_mask)
         if attention_mask is not None and cache_position is not None:
             causal_mask = causal_mask[:, :, cache_position, : key_states.shape[-2]]

+        print("after_causal_masks:",causal_mask)
         # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
         # Reference: https://github.com/pytorch/pytorch/issues/112577.
         if query_states.device.type == "cuda" and causal_mask is not None:
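The two added prints bracket the causal-mask handling in LlamoeSdpaAttention, logging the mask before and after it is sliced down to the current cache positions and cached key length. For context, here is a minimal standalone sketch of what that slicing does, assuming the 4-D mask layout (batch, 1, query_len, max_cache_len) and a cache_position index tensor as used by Hugging Face SDPA attention; all shapes and names are illustrative, not taken from the model file.

```python
# Illustrative sketch only: shows the effect of the sliced line in the diff
# on dummy tensors, not the model's actual forward pass.
import torch

batch, q_len, kv_len, max_cache_len = 2, 1, 12, 32

# Full causal mask padded to the cache length: 0 where attention is allowed,
# a large negative value where it is masked out (assumed layout).
attention_mask = torch.full(
    (batch, 1, q_len, max_cache_len), torch.finfo(torch.float32).min
)
attention_mask[..., :kv_len] = 0.0

# cache_position holds the absolute position(s) of the current query token(s).
cache_position = torch.tensor([11])
# (batch, num_heads, kv_len, head_dim) -- only shape[-2] matters here.
key_states = torch.randn(batch, 8, kv_len, 64)

causal_mask = attention_mask
print("causal_mask:", causal_mask.shape)        # torch.Size([2, 1, 1, 32])

if attention_mask is not None and cache_position is not None:
    # Select the rows for the current positions and trim the key axis to the
    # number of cached keys, mirroring the line printed in the diff.
    causal_mask = causal_mask[:, :, cache_position, : key_states.shape[-2]]

print("after_causal_masks:", causal_mask.shape)  # torch.Size([2, 1, 1, 12])
```

The subsequent `if query_states.device.type == "cuda" and causal_mask is not None:` branch is the standard workaround for the torch 2.1.2 memory-efficient SDPA bug referenced in the comment (pytorch issue 112577), which typically forces the query/key/value tensors to be contiguous before calling scaled_dot_product_attention; its body is not shown in this hunk.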