Update code/inference.py
code/inference.py · +30 −9
```diff
@@ -4,7 +4,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 from typing import List, Dict
 from accelerate import load_checkpoint_and_dispatch
 import fcntl  # For file locking
-
+import os
+import time  # Needed by the polling loop in model_fn
 # Global variables to persist the model and tokenizer between invocations
 model = None
 tokenizer = None
@@ -16,12 +17,13 @@ def format_chat(messages: List[Dict[str, str]], tokenizer) -> str:
     """
     return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
-# Model loading function for SageMaker with tensor parallelism and offloading
 def model_fn(model_dir, context=None):
     global model, tokenizer
-
+
     # Path to lock file for ensuring single loading
     lock_file = "/tmp/model_load.lock"
+    # Path to in-progress file indicating model loading is happening
+    in_progress_file = "/tmp/model_loading_in_progress"
 
     if model is not None:
         print("Model already loaded, skipping reload.")
@@ -33,27 +35,46 @@ def model_fn(model_dir, context=None):
         fcntl.flock(lock, fcntl.LOCK_EX)  # Exclusive lock
 
         try:
+            # Check if another worker is in the process of loading
+            if os.path.exists(in_progress_file):
+                print("Another worker is currently loading the model, waiting...")
+
+                # Poll the in-progress flag until the other worker finishes loading
+                while os.path.exists(in_progress_file):
+                    time.sleep(5)  # Wait for 5 seconds before checking again
+
+                print("Loading complete by another worker, skipping reload.")
+                return model, tokenizer
+
+            # If no one is loading, start loading the model and set the in-progress flag
+            print("No one is loading, proceeding to load the model.")
+            with open(in_progress_file, 'w') as f:
+                f.write("loading")
+
             if model is None:
                 print("Loading the model and tokenizer...")
 
                 offload_dir = "/tmp/offload_dir"
                 os.makedirs(offload_dir, exist_ok=True)
 
-                # Load and dispatch model
-                model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype=
+                # Load and dispatch model across 8 GPUs using tensor parallelism
+                model = AutoModelForCausalLM.from_pretrained(model_dir, torch_dtype="auto")
                 model = load_checkpoint_and_dispatch(
                     model,
                     model_dir,
-                    device_map="auto",
-                    offload_folder=offload_dir
+                    device_map="auto",  # Automatically map layers across GPUs
+                    offload_folder=offload_dir,  # Offload parts to disk if needed
+                    max_memory={i: "24GiB" for i in range(8)}  # Set memory limit per GPU
                 )
 
                 # Load the tokenizer
                 tokenizer = AutoTokenizer.from_pretrained(model_dir)
 
-            else:
-                print("Another process loaded the model while waiting for the lock.")
         finally:
+            # Remove the in-progress flag once the loading is complete
+            if os.path.exists(in_progress_file):
+                os.remove(in_progress_file)
+
             # Release the lock
             fcntl.flock(lock, fcntl.LOCK_UN)
 
```
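For reference, here is a minimal smoke test showing how this entry point could be exercised outside of SageMaker. It is a sketch, not part of the commit: it assumes the checkpoint lives at `/opt/ml/model` (SageMaker's default model directory), that `code/inference.py` is importable as `inference`, and that `model_fn` ends by returning the `(model, tokenizer)` pair, as its early-return path implies.

```python
# Hypothetical smoke test -- not part of the commit.
from inference import format_chat, model_fn

# Assumed checkpoint location; adjust to wherever the model is unpacked.
model, tokenizer = model_fn("/opt/ml/model")

# Build a chat-formatted prompt and run a short generation.
prompt = format_chat([{"role": "user", "content": "Hello!"}], tokenizer)
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```

The commit layers two guards around loading: the `fcntl` lock serializes workers that share the lock file, while the in-progress sentinel lets a worker that arrives mid-load wait for the first loader to finish instead of repeating the work. Deleting the sentinel in `finally` keeps waiters from polling forever if loading fails.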