nikravan committed · verified
Commit a637d3e · 1 Parent(s): 38fae63

Update app.py

Files changed (1):
  app.py (+8 −21)
app.py CHANGED
```diff
@@ -53,32 +53,25 @@ latex_delimiters_set = [{
 
 @spaces.GPU()
 def predict(message, history, system_prompt, temperature, max_new_tokens, top_k, repetition_penalty, top_p):
+    # Format history with a given chat template
 
-    stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
+
+    stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
     instruction = '<|im_start|>system\n' + system_prompt + '\n<|im_end|>\n'
     for user, assistant in history:
         instruction += f'<|im_start|>user\n{user}\n<|im_end|>\n<|im_start|>assistant\n{assistant}\n<|im_end|>\n'
     instruction += f'<|im_start|>user\n{message}\n<|im_end|>\n<|im_start|>assistant\n'
 
-    print("Formatted Instruction:", instruction)
-
-
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
+    print(instruction)
 
+    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
     enc = tokenizer(instruction, return_tensors="pt", padding=True, truncation=True)
     input_ids, attention_mask = enc.input_ids, enc.attention_mask
 
-
     if input_ids.shape[1] > CONTEXT_LENGTH:
         input_ids = input_ids[:, -CONTEXT_LENGTH:]
         attention_mask = attention_mask[:, -CONTEXT_LENGTH:]
 
-
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-
-    # Define the generation parameters
     generate_kwargs = dict(
         input_ids=input_ids.to(device),
         attention_mask=attention_mask.to(device),
@@ -88,24 +81,18 @@ def predict(message, history, system_prompt, temperature, max_new_tokens, top_k,
         max_new_tokens=max_new_tokens,
         top_k=top_k,
         repetition_penalty=repetition_penalty,
-        top_p=top_p,
-        pad_token_id=tokenizer.pad_token_id,  # Explicitly set pad_token_id
-        eos_token_id=tokenizer.eos_token_id,  # Explicitly set eos_token_id
+        top_p=top_p
     )
-
-    # Start the generation in a separate thread
     t = Thread(target=model.generate, kwargs=generate_kwargs)
     t.start()
-
-    # Stream the output token by token
     outputs = []
     for new_token in streamer:
         outputs.append(new_token)
-        if any(stop_token in new_token for stop_token in stop_tokens):
+        if new_token in stop_tokens:
+
             break
     yield "".join(outputs)
 
-
     # Load model
     device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
     quantization_config = BitsAndBytesConfig(
```
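
A side effect of this change: the old loop stopped on any streamed chunk *containing* a stop string, while the new `new_token in stop_tokens` only fires when a chunk *equals* one exactly. Since `TextIteratorStreamer` yields decoded text chunks rather than single tokens (and `skip_special_tokens=True` usually strips registered special tokens from the stream anyway), the two checks can disagree. A minimal, self-contained sketch of the difference, using hypothetical chunks rather than real model output:

```python
# Hypothetical streamed chunks -- illustrative only, not app.py output.
stop_tokens = ["<|endoftext|>", "<|im_end|>", "|im_end|"]
chunks = ["Hello", " world", "<|im_end|>", "done<|im_end|>"]

for chunk in chunks:
    old_hit = any(stop in chunk for stop in stop_tokens)  # substring scan (old code)
    new_hit = chunk in stop_tokens                        # exact match (new code)
    print(f"{chunk!r}: old={old_hit} new={new_hit}")

# 'done<|im_end|>' trips the old check but not the new one, so the new
# code leans on generation ending at EOS rather than on this loop test.
```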
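The commit also drops the `pad_token` fallback and the explicit `pad_token_id`/`eos_token_id` in `generate_kwargs`. For tokenizers that ship without a pad token, requesting `padding=True` can raise at encode time, and `model.generate` warns and falls back to `eos_token_id` for padding. A sketch of the removed guard, assuming a standard `transformers` tokenizer; the model name below is illustrative, not the one this Space loads:

```python
from transformers import AutoTokenizer

# gpt2 is a stand-in: its tokenizer ships without a pad token.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# The guard this commit removed: reuse EOS as PAD so padding=True works.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

enc = tokenizer(["short", "a longer example"], return_tensors="pt", padding=True)
print(enc.input_ids.shape)  # padded to the longer sequence
```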