Fixed GIL issue
Fixes a race condition between CoreML inference and the causal_mask update.
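The core of the fix is to build the causal mask once, up front, and only slice it per token afterwards, so the full mask tensor is never rebuilt while CoreML predict() calls are in flight. The sketch below is illustrative only: build_causal_mask and the loop are stand-ins, not the chat.py functions; in the real diff this is done by make_causal_mask/initialize_causal_mask, and the per-token slice is fed to the model as the 'causal_mask' input.

import torch

def build_causal_mask(context_length: int) -> torch.Tensor:
    # Additive attention mask: 0 where attention is allowed, -inf elsewhere.
    mask = torch.full((1, 1, context_length, context_length), float('-inf'),
                      dtype=torch.float16)
    allowed = torch.tril(torch.ones(context_length, context_length)) > 0
    mask.masked_fill_(allowed, 0.0)
    return mask

context_length = 512
causal_mask = build_causal_mask(context_length)   # created once, before any inference

for pos in range(1, 4):
    # Each decode step only takes a [1, 1, 1, context_length] slice of the
    # precomputed mask; nothing is reallocated while inference is running.
    single_causal_mask = causal_mask[:, :, pos - 1:pos, :]
    assert single_causal_mask.shape == (1, 1, 1, context_length)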
chat.py
CHANGED
@@ -243,18 +243,28 @@ def load_metadata(model,args):
     else:
         ctx_len = args.context_length
 
-        # Use defaults
+        # Use defaults or values from args
         metadata['context_length'] = ctx_len
         metadata['state_length'] = ctx_len
-
+        # Get batch size from args or use default
+        metadata['batch_size'] = getattr(args, 'batch_size', 64)
         metadata['lut_bits'] = 4
-        metadata['num_chunks'] = 4
-        print("\nUsing
+        metadata['num_chunks'] = getattr(args, 'num_chunks', 4)
+        print("\nUsing parameters:")
         print(f" Context Length: {metadata['context_length']}")
         print(f" State Length: {metadata['state_length']}")
         print(f" Prefill Batch Size: {metadata['batch_size']}")
         print(f" LUT Bits: {metadata['lut_bits']}")
         print(f" Number of Chunks: {metadata['num_chunks']}")
+
+    # Override with values from args if they exist
+    if hasattr(args, 'batch_size') and args.batch_size is not None:
+        metadata['batch_size'] = args.batch_size
+        print(f"\nOverriding batch size from args: {args.batch_size}")
+    if hasattr(args, 'num_chunks') and args.num_chunks is not None:
+        metadata['num_chunks'] = args.num_chunks
+        print(f"\nOverriding num chunks from args: {args.num_chunks}")
+
     return metadata
 
 def load_models(args,metadata):
@@ -376,11 +386,19 @@ def make_causal_mask(length, start):
     mask[:, :, col_indices <= (row_indices + start)] = 0
     return mask
 
-def
-    """
-    # Create causal mask
+def initialize_causal_mask(context_length):
+    """Initialize causal mask for transformer attention."""
     causal_mask = make_causal_mask(context_length, 0)
     causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
+    print(f"\nInitialized causal mask for context length {context_length}")
+    return causal_mask
+
+def run_prefill(embed_model, ffn_models, input_ids, context_pos, context_length, batch_size=64, state=None, causal_mask=None):
+    """Run prefill on the input sequence."""
+    # Use provided causal mask or create one if not provided
+    if causal_mask is None:
+        causal_mask = make_causal_mask(context_length, 0)
+        causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
 
     # Process in batches
     batch_pos = 0
@@ -423,7 +441,7 @@ def run_prefill(embed_model, ffn_models, input_ids, context_pos, context_length,
 
     return torch.tensor([context_pos], dtype=torch.int32)
 
-def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, context_length, state=None, temperature=0.0):
+def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, context_length, state=None, causal_mask=None, temperature=0.0):
     """Generate the next token."""
     # Get current token
     current_token = input_ids[:, pos-1:pos] # [1, 1]
@@ -437,8 +455,13 @@ def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, c
     update_mask = torch.zeros((1, 1, context_length, 1), dtype=torch.float16)
     update_mask[0, 0, pos-1, 0] = 1.0
     position_ids = torch.tensor([pos-1], dtype=torch.int32) # [1]
-
-
+
+    # Use provided causal mask or create one if not provided
+    if causal_mask is None:
+        causal_mask_data = make_causal_mask(context_length, 0)
+        single_causal_mask = torch.tensor(causal_mask_data[:, :, pos-1:pos, :], dtype=torch.float16) # [1, 1, 1, context_length]
+    else:
+        single_causal_mask = causal_mask[:, :, pos-1:pos, :]
 
     # Run through FFN chunks with state
     for ffn_model in ffn_models:
@@ -447,7 +470,7 @@ def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, c
             'hidden_states': hidden_states.numpy(),
             'update_mask': update_mask.numpy(),
             'position_ids': position_ids.numpy(),
-            'causal_mask':
+            'causal_mask': single_causal_mask.numpy(),
             'current_pos': position_ids.numpy()
         }
         output = ffn_model['infer'].predict(inputs, state)
@@ -493,7 +516,7 @@ def create_unified_state(ffn_models, context_length):
     print("\nCreated unified transformer state")
     return state
 
-def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state, auto_prompt=None, warmup=False):
+def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state, causal_mask=None, auto_prompt=None, warmup=False):
     """Interactive chat loop."""
     context_length = metadata.get('context_length')
     batch_size = metadata.get('batch_size', 64)
@@ -567,7 +590,7 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
             # Start prefill timing
             prefill_start = time.time()
 
-            # Run prefill with state
+            # Run prefill with state and causal mask
             current_pos = run_prefill(
                 embed_model,
                 ffn_models,
@@ -575,7 +598,8 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                 context_pos,
                 context_length,
                 batch_size,
-                state
+                state,
+                causal_mask
             )
 
             # Calculate prefill timing
@@ -590,7 +614,7 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
             inference_tokens = 0
 
             while pos < context_length - 1:
-                # Generate next token
+                # Generate next token with causal mask
                 next_token = generate_next_token(
                     embed_model,
                     ffn_models,
@@ -598,7 +622,8 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                     input_ids,
                     pos,
                     context_length,
-                    state
+                    state,
+                    causal_mask
                 )
 
                 # Add token to sequence
@@ -657,7 +682,7 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
         traceback.print_exc()
 
 def parse_args():
-    parser = argparse.ArgumentParser(description='Chat with CoreML LLaMA (c) 2025 Anemll')
+    parser = argparse.ArgumentParser(description='Chat with CoreML LLaMA, gil resolved (c) 2025 Anemll')
 
     # Add meta.yaml option
     parser.add_argument('--meta', type=str, help='Path to meta.yaml to load all parameters')
@@ -678,9 +703,15 @@ def parse_args():
     parser.add_argument('--prompt', type=str,
                         help='If specified, run once with this prompt and exit')
 
+    # Add no-warmup flag
+    parser.add_argument('--nw', action='store_true',
+                        help='Skip warmup phase')
+
     # Model configuration
     parser.add_argument('--context-length', type=int,
                         help='Context length for the model (default: 512), if not provided, it will be detected from the model directory name ctxNUMBER')
+    parser.add_argument('--batch-size', type=int,
+                        help='Batch size for prefill (default: 64)')
 
     args = parser.parse_args()
 
@@ -711,9 +742,11 @@ def parse_args():
     if not args.tokenizer:
         args.tokenizer = args.d
 
-    # Set other parameters
-    args.context_length
-
+    # Set other parameters if not overridden by command line
+    if args.context_length is None:
+        args.context_length = int(params['context_length'])
+    if args.batch_size is None:
+        args.batch_size = int(params['batch_size'])
     args.num_chunks = num_chunks
 
     print(f"\nLoaded parameters from {args.meta}:")
@@ -782,18 +815,23 @@ def main():
     # Create unified state once
     state = create_unified_state(ffn_models, metadata['context_length'])
 
+    # Initialize causal mask once
+    causal_mask = initialize_causal_mask(metadata['context_length'])
+
     # Warmup runs to prevent Python GIL issues with CoreML !
-
-
-
-
-
-
-
-
-
-
-
+    if not args.nw:
+        for i in range(2):
+            chat_loop(
+                embed_model=embed_model,
+                ffn_models=ffn_models,
+                lmhead_model=lmhead_model,
+                tokenizer=tokenizer,
+                metadata=metadata,
+                state=state,
+                causal_mask=causal_mask, # Pass the causal mask
+                warmup=True,
+                auto_prompt="who are you?"
+            )
 
     # Main run
     chat_loop(
@@ -803,6 +841,7 @@ def main():
         tokenizer=tokenizer,
         metadata=metadata,
         state=state,
+        causal_mask=causal_mask, # Pass the causal mask
         warmup=False,
         auto_prompt=args.prompt
     )
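For completeness, the two new command-line options added in parse_args() can be checked on their own. The snippet below is a standalone argparse sketch mirroring the flags from the diff, not part of the commit; the example path in the comment is illustrative.

import argparse

parser = argparse.ArgumentParser(description='Chat with CoreML LLaMA, gil resolved (c) 2025 Anemll')
parser.add_argument('--nw', action='store_true', help='Skip warmup phase')
parser.add_argument('--batch-size', type=int, help='Batch size for prefill (default: 64)')

# e.g. `python chat.py --meta <path-to-meta.yaml> --nw --batch-size 128` would skip
# the two warmup chat_loop() passes and override the prefill batch size.
args = parser.parse_args(['--nw', '--batch-size', '128'])
print(args.nw, args.batch_size)   # True 128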