anemll committed
Commit 3f3158f (verified)
Parent: 3315517

Fixed GIL issue


Fixes a race condition between CoreML execution and the causal_mask update: the mask is now initialized once in main() and passed into run_prefill() and generate_next_token() instead of being rebuilt on every call.
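The change is easiest to see in isolation: the causal mask is built once, up front, and each prefill/decode step only takes a read-only slice of that single tensor, rather than re-allocating the mask inside the call while a CoreML prediction may still be reading it. A paraphrased, standalone sketch of that pattern (simplified helpers, not the exact chat_full.py code):

import torch

def make_causal_mask(length: int, start: int) -> torch.Tensor:
    """Additive causal mask: 0 where attention is allowed, -inf elsewhere."""
    mask = torch.full((1, 1, length, length), float("-inf"), dtype=torch.float16)
    row = torch.arange(length).unsqueeze(1)
    col = torch.arange(length).unsqueeze(0)
    mask[:, :, col <= (row + start)] = 0.0
    return mask

def initialize_causal_mask(context_length: int) -> torch.Tensor:
    # Built once on the main thread, before any warmup or chat loop runs.
    return make_causal_mask(context_length, 0)

def generation_step_mask(causal_mask: torch.Tensor, pos: int) -> torch.Tensor:
    # Each decode step only slices the shared tensor; nothing is rebuilt or
    # mutated, so a concurrent CoreML predict never sees a half-built mask.
    return causal_mask[:, :, pos - 1:pos, :]

if __name__ == "__main__":
    mask = initialize_causal_mask(8)
    print(generation_step_mask(mask, pos=3).shape)  # torch.Size([1, 1, 1, 8])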

Files changed (1)
chat_full.py  +219 -116
chat_full.py CHANGED
@@ -28,6 +28,8 @@ RESET_COLOR = "\033[0m"
 
 # Add at the top with other constants
 WARMUP_TOKEN_LIMIT = 10  # Maximum tokens to generate during warmup
+THINKING_MODE = False
+THINKING_PROMPT = """You are a deep thinking AI, you may use extremely long chains of thought to deeply consider the problem and deliberate with yourself via systematic reasoning processes to help come to a correct solution prior to answering. You should enclose your thoughts and internal monologue inside <think> </think> tags, and then provide your solution or response to the problem."""
 
 class TokenPrinter:
     """Handles background printing of generated tokens."""
@@ -191,6 +193,89 @@ def load_model(path, function_name=None):
         print("\nTry using the .mlpackage version instead, or recompile the model.")
         raise
 
+def parse_args():
+    parser = argparse.ArgumentParser(description='Full Chat with CoreML LLaMA with context window shifting, gil resolved (c) 2025 Anemll')
+
+    # Add meta.yaml option
+    parser.add_argument('--meta', type=str, help='Path to meta.yaml to load all parameters')
+
+    # Add existing arguments
+    parser.add_argument('--d', '--dir', type=str, default='.',
+                        help='Directory containing model files (default: current directory)')
+    parser.add_argument('--embed', type=str, required=False,
+                        help='Path to embeddings model (relative to --dir)')
+    parser.add_argument('--ffn', type=str, required=False,
+                        help='Path to FFN model (can be chunked, relative to --dir)')
+    parser.add_argument('--lmhead', type=str, required=False,
+                        help='Path to LM head model (relative to --dir)')
+    parser.add_argument('--tokenizer', type=str, required=False,
+                        help='Path to tokenizer')
+
+    # Add new argument for auto-generation
+    parser.add_argument('--prompt', type=str,
+                        help='If specified, run once with this prompt and exit')
+
+    # Add no-warmup flag
+    parser.add_argument('--nw', action='store_true',
+                        help='Skip warmup phase')
+
+    # Model configuration
+    parser.add_argument('--context-length', type=int,
+                        help='Context length for the model (default: 512), if not provided, it will be detected from the model directory name ctxNUMBER')
+    parser.add_argument('--batch-size', type=int,
+                        help='Batch size for prefill (default: 64)')
+
+    args = parser.parse_args()
+
+    # If meta.yaml is provided, load parameters from it
+    if args.meta:
+        try:
+            with open(args.meta, 'r') as f:
+                meta = yaml.safe_load(f)
+            params = meta['model_info']['parameters']
+
+            # Set model directory to meta.yaml directory if not specified
+            if not args.d or args.d == '.':
+                args.d = str(Path(args.meta).parent)
+
+            # Build model paths based on parameters
+            prefix = params.get('model_prefix', 'llama')  # Default to 'llama' if not specified
+            lut_ffn = f"_lut{params['lut_ffn']}" if params['lut_ffn'] != 'none' else ''
+            lut_lmhead = f"_lut{params['lut_lmhead']}" if params['lut_lmhead'] != 'none' else ''
+            num_chunks = int(params['num_chunks'])
+
+            # Set model paths if not specified
+            if not args.embed:
+                args.embed = f'{prefix}_embeddings'
+            if not args.lmhead:
+                args.lmhead = f'{prefix}_lm_head{lut_lmhead}'
+            if not args.ffn:
+                args.ffn = f'{prefix}_FFN_PF{lut_ffn}_chunk_01of{num_chunks:02d}'
+            if not args.tokenizer:
+                args.tokenizer = args.d
+
+            # Set other parameters if not overridden by command line
+            if args.context_length is None:
+                args.context_length = int(params['context_length'])
+            if args.batch_size is None:
+                args.batch_size = int(params['batch_size'])
+            args.num_chunks = num_chunks
+
+            print(f"\nLoaded parameters from {args.meta}:")
+            print(f"  Context Length: {args.context_length}")
+            print(f"  Batch Size: {args.batch_size}")
+            print(f"  Num Chunks: {args.num_chunks}")
+            print(f"  Models Directory: {args.d}")
+            print(f"  Embeddings: {args.embed}")
+            print(f"  LM Head: {args.lmhead}")
+            print(f"  FFN: {args.ffn}")
+
+        except Exception as e:
+            print(f"\nError loading meta.yaml: {str(e)}")
+            sys.exit(1)
+
+    return args
+
 def load_metadata(model,args):
     # Extract metadata and config parameters
     metadata = {}
@@ -246,18 +331,28 @@ def load_metadata(model,args):
     else:
         ctx_len = args.context_length
 
-    # Use defaults
+    # Use defaults or values from args
     metadata['context_length'] = ctx_len
     metadata['state_length'] = ctx_len
-    metadata['batch_size'] = 64
+    # Get batch size from args or use default
+    metadata['batch_size'] = getattr(args, 'batch_size', 64)
     metadata['lut_bits'] = 4
-    metadata['num_chunks'] = 4
-    print("\nUsing default parameters:")
+    metadata['num_chunks'] = getattr(args, 'num_chunks', 4)
+    print("\nUsing parameters:")
     print(f"  Context Length: {metadata['context_length']}")
     print(f"  State Length: {metadata['state_length']}")
     print(f"  Prefill Batch Size: {metadata['batch_size']}")
     print(f"  LUT Bits: {metadata['lut_bits']}")
     print(f"  Number of Chunks: {metadata['num_chunks']}")
+
+    # Override with values from args if they exist
+    if hasattr(args, 'batch_size') and args.batch_size is not None:
+        metadata['batch_size'] = args.batch_size
+        print(f"\nOverriding batch size from args: {args.batch_size}")
+    if hasattr(args, 'num_chunks') and args.num_chunks is not None:
+        metadata['num_chunks'] = args.num_chunks
+        print(f"\nOverriding num chunks from args: {args.num_chunks}")
+
     return metadata
 
 def load_models(args,metadata):
@@ -379,7 +474,7 @@ def make_causal_mask(length, start):
     mask[:, :, col_indices <= (row_indices + start)] = 0
     return mask
 
-def run_prefill(embed_model, ffn_models, input_ids, current_pos, context_length, batch_size, state):
+def run_prefill(embed_model, ffn_models, input_ids, current_pos, context_length, batch_size, state, causal_mask):
     """Run prefill on the input sequence."""
     #print(f"[DEBUG] Running prefill from 0 to {current_pos}")
 
@@ -404,9 +499,7 @@ def run_prefill(embed_model, ffn_models, input_ids, current_pos, context_length,
         # Generate position IDs for this batch
         position_ids = torch.arange(batch_pos, batch_pos + batch_size, dtype=torch.int32)
 
-        # Create causal mask for this batch
-        causal_mask = make_causal_mask(context_length, 0)  # Always start from 0 for prefill
-        causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
+        # Use the pre-initialized causal mask and extract the batch portion
        batch_causal_mask = causal_mask[:, :, batch_pos:batch_pos + batch_size, :]
 
         # Run embeddings
@@ -430,7 +523,7 @@ def run_prefill(embed_model, ffn_models, input_ids, current_pos, context_length,
 
     return torch.tensor([current_pos], dtype=torch.int32)
 
-def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, context_length, state=None, temperature=0.0):
+def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, context_length, state, causal_mask, temperature=0.0):
     """Generate the next token."""
     # Get current token
     current_token = input_ids[:, pos-1:pos]
@@ -445,9 +538,8 @@ def generate_next_token(embed_model, ffn_models, lmhead_model, input_ids, pos, c
     update_mask[0, 0, pos-1, 0] = 1.0
     position_ids = torch.tensor([pos-1], dtype=torch.int32)
 
-    # Create causal mask for current position
-    causal_mask = make_causal_mask(context_length, 0)  # Always start from 0 for generation
-    single_causal_mask = torch.tensor(causal_mask[:, :, pos-1:pos, :], dtype=torch.float16)
+    # Use the pre-initialized causal mask and extract the single position portion
+    single_causal_mask = causal_mask[:, :, pos-1:pos, :]
 
     # Run through FFN chunks
     for ffn_model in ffn_models:
@@ -496,23 +588,84 @@ def create_unified_state(ffn_models, context_length):
     print("\nCreated unified transformer state")
     return state
 
+def initialize_causal_mask(context_length):
+    """Initialize causal mask for transformer attention."""
+    causal_mask = make_causal_mask(context_length, 0)
+    causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
+    print(f"\nInitialized causal mask for context length {context_length}")
+    return causal_mask
+
 def get_user_input():
-    sys.stdout.write(f"\n{LIGHT_GREEN}You:{RESET_COLOR} ")
-    sys.stdout.flush()
-    line = sys.stdin.readline()
-    if not line:
-        raise EOFError
-    return line.rstrip('\n')
-
-def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state, auto_prompt=None, warmup=False):
+    """Get input from user, handling special key combinations."""
+    global THINKING_MODE
+    try:
+        import termios
+        import tty
+        import sys
+
+        def _getch():
+            fd = sys.stdin.fileno()
+            old_settings = termios.tcgetattr(fd)
+            try:
+                tty.setraw(sys.stdin.fileno())
+                ch = sys.stdin.read(1)
+            finally:
+                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
+            return ch
+
+        buffer = []
+        while True:
+            char = _getch()
+
+            # Debug: print the character code
+            print(f"\nKey pressed: {repr(char)} (hex: {hex(ord(char))})")
+
+            # Check for Enter key
+            if char == '\r' or char == '\n':
+                print()  # Move to next line
+                input_text = ''.join(buffer)
+                # Check if the command is /t
+                if input_text == '/t':
+                    THINKING_MODE = not THINKING_MODE
+                    print(f"Thinking mode {'ON' if THINKING_MODE else 'OFF'}")
+                    buffer = []  # Clear buffer
+                    print(f"\n{LIGHT_GREEN}You{' (thinking)' if THINKING_MODE else ''}:{RESET_COLOR}", end=' ', flush=True)
+                    continue
+                return input_text
+
+            # Handle backspace
+            if char == '\x7f':  # backspace
+                if buffer:
+                    buffer.pop()
+                    sys.stdout.write('\b \b')  # Erase character
+                    sys.stdout.flush()
+                continue
+
+            # Handle Ctrl-C
+            if char == '\x03':  # Ctrl-C
+                print("^C")
+                raise KeyboardInterrupt
+
+            # Print character and add to buffer
+            sys.stdout.write(char)
+            sys.stdout.flush()
+            buffer.append(char)
+
+    except ImportError:
+        # Fallback for systems without termios
+        return input("> ")
+
+def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state, causal_mask, auto_prompt=None, warmup=False):
     """Interactive chat loop."""
+    global THINKING_MODE
     context_length = metadata.get('context_length')
     batch_size = metadata.get('batch_size', 64)
 
     if not warmup:
         print(f"\nUsing context length: {context_length}")
         print("\nStarting chat session. Press Ctrl+D to exit.")
-        print("Type your message and press Enter to chat.")
+        print("Type your message and press Enter to chat. Use /t to toggle thinking mode.")
+        print(f"Thinking mode is {'ON' if THINKING_MODE else 'OFF'}")
 
     # Keep track of conversation history
     conversation = []
@@ -521,7 +674,7 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
     while True:
         try:
             if not warmup:
-                print(f"\n{LIGHT_GREEN}You:{RESET_COLOR}", end=' ', flush=True)
+                print(f"\n{LIGHT_GREEN}You{' (thinking)' if THINKING_MODE else ''}:{RESET_COLOR}", end=' ', flush=True)
             if auto_prompt is not None:
                 user_input = auto_prompt
                 if not warmup:
@@ -535,16 +688,31 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
 
             if not user_input:
                 continue
+
+            # Handle /t command
+            if user_input == "/t":
+                THINKING_MODE = not THINKING_MODE
+                print(f"Thinking mode {'ON' if THINKING_MODE else 'OFF'}")
+                continue
 
             # Add user message to conversation
             conversation.append({"role": "user", "content": user_input})
 
             # Format using chat template with full history
-            base_input_ids = tokenizer.apply_chat_template(
-                conversation,
-                return_tensors="pt",
-                add_generation_prompt=True
-            ).to(torch.int32)
+            if THINKING_MODE:
+                # Add thinking prompt to system message
+                conversation_with_thinking = [{"role": "system", "content": THINKING_PROMPT}] + conversation
+                base_input_ids = tokenizer.apply_chat_template(
+                    conversation_with_thinking,
+                    return_tensors="pt",
+                    add_generation_prompt=True
+                ).to(torch.int32)
+            else:
+                base_input_ids = tokenizer.apply_chat_template(
+                    conversation,
+                    return_tensors="pt",
+                    add_generation_prompt=True
+                ).to(torch.int32)
 
             # Check if we need to trim history
             while base_input_ids.size(1) > context_length - 100:  # Leave room for response
@@ -579,10 +747,6 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
             generation_start_time = time.time()
 
             try:
-                # Create initial causal mask
-                causal_mask = make_causal_mask(context_length, 0)
-                causal_mask = torch.tensor(causal_mask, dtype=torch.float16)
-
                 # Run prefill on entire context
                 current_pos = run_prefill(
                     embed_model,
@@ -591,7 +755,8 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                     context_pos,
                     context_length,
                     batch_size,
-                    state
+                    state,
+                    causal_mask
                 )
                 #print(f"\n[DEBUG] After initial prefill - current_pos: {current_pos}")
 
@@ -625,7 +790,8 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                         new_size,  # Prefill the entire shifted content
                         context_length,
                         batch_size,
-                        state
+                        state,
+                        causal_mask
                     )
 
                     # Start generating from the next position
@@ -644,7 +810,8 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
                         input_ids,
                         pos,
                         context_length,
-                        state
+                        state,
+                        causal_mask
                    )
 
                    # Add token
@@ -697,77 +864,7 @@ def chat_loop(embed_model, ffn_models, lmhead_model, tokenizer, metadata, state,
             traceback.print_exc()
 
 def main():
-    parser = argparse.ArgumentParser(description='Full Chat with CoreML LLaMA with context window shifting (c) 2025 Anemll')
-
-    # Add meta.yaml option
-    parser.add_argument('--meta', type=str, help='Path to meta.yaml to load all parameters')
-
-    # Add existing arguments
-    parser.add_argument('--d', '--dir', type=str, default='.',
-                        help='Directory containing model files (default: current directory)')
-    parser.add_argument('--embed', type=str, required=False,
-                        help='Path to embeddings model (relative to --dir)')
-    parser.add_argument('--ffn', type=str, required=False,
-                        help='Path to FFN model (can be chunked, relative to --dir)')
-    parser.add_argument('--lmhead', type=str, required=False,
-                        help='Path to LM head model (relative to --dir)')
-    parser.add_argument('--tokenizer', type=str, required=False,
-                        help='Path to tokenizer')
-
-    # Add new argument for auto-generation
-    parser.add_argument('--prompt', type=str,
-                        help='If specified, run once with this prompt and exit')
-
-    # Model configuration
-    parser.add_argument('--context-length', type=int,
-                        help='Context length for the model (default: 512), if not provided, it will be detected from the model directory name ctxNUMBER')
-
-    args = parser.parse_args()
-
-    # If meta.yaml is provided, load parameters from it
-    if args.meta:
-        try:
-            with open(args.meta, 'r') as f:
-                meta = yaml.safe_load(f)
-            params = meta['model_info']['parameters']
-
-            # Set model directory to meta.yaml directory if not specified
-            if not args.d or args.d == '.':
-                args.d = str(Path(args.meta).parent)
-
-            # Build model paths based on parameters
-            prefix = params.get('model_prefix', 'llama')  # Default to 'llama' if not specified
-            lut_ffn = f"_lut{params['lut_ffn']}" if params['lut_ffn'] != 'none' else ''
-            lut_lmhead = f"_lut{params['lut_lmhead']}" if params['lut_lmhead'] != 'none' else ''
-            num_chunks = int(params['num_chunks'])
-
-            # Set model paths if not specified
-            if not args.embed:
-                args.embed = f'{prefix}_embeddings'
-            if not args.lmhead:
-                args.lmhead = f'{prefix}_lm_head{lut_lmhead}'
-            if not args.ffn:
-                args.ffn = f'{prefix}_FFN_PF{lut_ffn}_chunk_01of{num_chunks:02d}'
-            if not args.tokenizer:
-                args.tokenizer = args.d
-
-            # Set other parameters
-            args.context_length = int(params['context_length'])
-            args.batch_size = int(params['batch_size'])
-            args.num_chunks = num_chunks
-
-            print(f"\nLoaded parameters from {args.meta}:")
-            print(f"  Context Length: {args.context_length}")
-            print(f"  Batch Size: {args.batch_size}")
-            print(f"  Num Chunks: {args.num_chunks}")
-            print(f"  Models Directory: {args.d}")
-            print(f"  Embeddings: {args.embed}")
-            print(f"  LM Head: {args.lmhead}")
-            print(f"  FFN: {args.ffn}")
-
-        except Exception as e:
-            print(f"\nError loading meta.yaml: {str(e)}")
-            sys.exit(1)
+    args = parse_args()
 
     # Convert directory to absolute path
     model_dir = Path(args.d).resolve()
@@ -817,18 +914,23 @@ def main():
     # Create unified state once
     state = create_unified_state(ffn_models, metadata['context_length'])
 
+    # Initialize causal mask once
+    causal_mask = initialize_causal_mask(metadata['context_length'])
+
     # Warmup runs to prevent Python GIL issues with CoreML !
-    for i in range(2):
-        chat_loop(
-            embed_model=embed_model,
-            ffn_models=ffn_models,
-            lmhead_model=lmhead_model,
-            tokenizer=tokenizer,
-            metadata=metadata,
-            state=state,  # Pass the state
-            warmup=True,
-            auto_prompt="who are you?"
-        )
+    if not args.nw:
+        for i in range(2):
+            chat_loop(
+                embed_model=embed_model,
+                ffn_models=ffn_models,
+                lmhead_model=lmhead_model,
+                tokenizer=tokenizer,
+                metadata=metadata,
+                state=state,  # Pass the state
+                causal_mask=causal_mask,  # Pass the causal mask
+                warmup=True,
+                auto_prompt="who are you?"
+            )
 
     # Main run
     chat_loop(
@@ -838,6 +940,7 @@ def main():
        tokenizer=tokenizer,
        metadata=metadata,
        state=state,  # Pass the state
+        causal_mask=causal_mask,  # Pass the causal mask
        warmup=False,
        auto_prompt=args.prompt
    )