Upload modeling_qwen2_anotated.py with huggingface_hub
modeling_qwen2_anotated.py  CHANGED  +13 -8
@@ -542,12 +542,10 @@ class Qwen2SdpaAttention(Qwen2Attention):
         query_states = self.q_proj(hidden_states)
         key_states = self.k_proj(hidden_states)
         value_states = self.v_proj(hidden_states)
-        #KEVINDEBUG query_states: torch.Size([1, 5|1, 896]) keystates:torch.Size([1, 5|1, 128]) value_states: torch.Size([1, 5|1, 128])
 
         query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
         key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
         value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
-        #KEVINDEBUG query_states: torch.Size([1, 14, 5|1, 64]) key_states: torch.Size([1, 2, 5|1, 64]) value_states: torch.Size([1, 2, 5|1, 64])
 
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
@@ -555,40 +553,47 @@ class Qwen2SdpaAttention(Qwen2Attention):
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
 
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
-
+
         if past_key_value is not None:
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}  # Specific to RoPE models
             key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)
-            #past_key_value class info : $<class 'transformers.cache_utils.DynamicCache'>
-            #KEVINDEBUG past_key_value is not None key_states: torch.Size([1, 2, 5...19, 64]) value_states: torch.Size([1, 2, 5..19, 64])
 
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)
-        #KEVINDEBUG key_states: torch.Size([1, 14, 5..19, 64]) value_states: torch.Size([1, 14, 5..19, 64])
 
         causal_mask = attention_mask
         if attention_mask is not None:  # no matter the length, we just slice it
             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
 
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and attention_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
         # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
         # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
         # The q_len > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case q_len == 1.
         is_causal = True if causal_mask is None and q_len > 1 else False
 
+        if is_causal:
+            L, S = query_states.size(-2), key_states.size(-2)
+            causal_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
+
         attn_output = torch.nn.functional.scaled_dot_product_attention(
             query_states,
             key_states,
             value_states,
             attn_mask=causal_mask,
             dropout_p=self.attention_dropout if self.training else 0.0,
-            is_causal=is_causal,
+            is_causal=False,
         )
 
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.view(bsz, q_len, self.hidden_size)
 
         attn_output = self.o_proj(attn_output)
-        #KEVINDEBUG attn_output is $torch.Size([1, 5|1, 896])
 
         return attn_output, None, past_key_value
 
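A note on the removed KEVINDEBUG shape annotations: the `5|1` entries record q_len on the prefill pass (5 tokens) versus each single-token decode step, and `5..19` records the cached key/value length growing from 5 to 19 across those decode steps. The head counts involved (14 query heads, 2 KV heads, head_dim 64, hidden size 896) are consistent with a Qwen2-0.5B-sized config. Below is a minimal torch-only sketch of that cache growth; the plain concatenation stands in for what `DynamicCache.update` does, and the shapes are illustrative rather than taken from this file.

```python
import torch

bsz, num_kv_heads, head_dim = 1, 2, 64

# Prefill: q_len == 5, so the cache starts with 5 key/value positions per KV head.
cached_k = torch.randn(bsz, num_kv_heads, 5, head_dim)

# Decode: each step appends one new position (q_len == 1), so the cached
# kv_seq_len grows 5 -> 6 -> ... -> 19 over 14 steps.
for _ in range(14):
    new_k = torch.randn(bsz, num_kv_heads, 1, head_dim)
    cached_k = torch.cat([cached_k, new_k], dim=-2)

print(cached_k.shape)  # torch.Size([1, 2, 19, 64]), matching the "5..19" note
```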
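The `repeat_kv` calls kept as context in the diff are where grouped-query attention expands the 2 KV heads to the 14 query heads seen in the shape notes. The sketch below follows the usual expand-and-reshape pattern for this step; it is restated here for illustration and is not copied from this file.

```python
import torch

def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (batch, num_kv_heads, seq, head_dim) -> (batch, num_kv_heads * n_rep, seq, head_dim),
    # so each KV head is shared by n_rep query heads.
    batch, num_kv_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    expanded = hidden_states[:, :, None, :, :].expand(batch, num_kv_heads, n_rep, slen, head_dim)
    return expanded.reshape(batch, num_kv_heads * n_rep, slen, head_dim)

key_states = torch.randn(1, 2, 19, 64)   # 2 KV heads, as in the debug shapes
print(repeat_kv(key_states, 7).shape)    # torch.Size([1, 14, 19, 64]): 14 heads for SDPA
```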
589 |
dropout_p=self.attention_dropout if self.training else 0.0,
|
590 |
+
is_causal=False,
|
591 |
)
|
592 |
|
593 |
attn_output = attn_output.transpose(1, 2).contiguous()
|
594 |
attn_output = attn_output.view(bsz, q_len, self.hidden_size)
|
595 |
|
596 |
attn_output = self.o_proj(attn_output)
|
|
|
597 |
|
598 |
return attn_output, None, past_key_value
|
599 |
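The main behavioral change is in the `is_causal` handling: instead of forwarding `is_causal=is_causal` and letting SDPA derive the causal structure, the annotated version materializes an explicit boolean lower-triangular mask whenever `is_causal` would have been True and always calls SDPA with `is_causal=False`. The self-contained sketch below checks that the two paths agree in the case this branch covers (prefill with q_len == kv_seq_len and no attention mask); shapes are illustrative.

```python
import torch
import torch.nn.functional as F

bsz, num_heads, seq_len, head_dim = 1, 14, 5, 64
q = torch.randn(bsz, num_heads, seq_len, head_dim)
k = torch.randn(bsz, num_heads, seq_len, head_dim)
v = torch.randn(bsz, num_heads, seq_len, head_dim)

# Path 1: let SDPA apply causal masking internally.
out_builtin = F.scaled_dot_product_attention(q, k, v, is_causal=True)

# Path 2: the explicit mask from the added lines (True = attend, False = masked out).
# (On CUDA the mask would also need to be created on q.device.)
L, S = q.size(-2), k.size(-2)
causal_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
out_explicit = F.scaled_dot_product_attention(q, k, v, attn_mask=causal_mask, is_causal=False)

print(torch.allclose(out_builtin, out_explicit, atol=1e-6))  # expected: True
```

One likely side effect worth noting: once an explicit `attn_mask` is passed, SDPA may no longer select the flash-attention kernel, so this change reads as being made for inspectability of the mask rather than for speed.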
|