merterbak committed
Commit 450d1bc · verified · 1 Parent(s): 6e74502

Create app.py

Files changed (1)
  1. app.py +84 -0
app.py ADDED
@@ -0,0 +1,84 @@
+ import gradio as gr
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ import torch
+ from threading import Thread
+ import time
+
+ model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
+ # ChatML-style template: one <|im_start|>...<|im_end|> block per message,
+ # closed by an assistant header so generation continues from there.
+ tokenizer.chat_template = "{% for message in messages %}<|im_start|>{{ message.role }}\n{{ message.content }}<|im_end|>\n{% endfor %}<|im_start|>assistant\n"
+
+ # Load the model in float32 on CPU; the Space has no GPU.
+ model = AutoModelForCausalLM.from_pretrained(
+     model_name,
+     torch_dtype=torch.float32,
+     device_map="cpu",
+     low_cpu_mem_usage=True
+ ).to('cpu')
+
+ class DeepStreamer(TextIteratorStreamer):
+     """TextIteratorStreamer that also tracks tokens per second."""
+
+     def __init__(self, tokenizer):
+         super().__init__(tokenizer, skip_prompt=True, skip_special_tokens=True)
+         self.token_count = 0
+         self.start_time = None
+
+     def put(self, value):
+         # The first put() call carries the prompt tokens; skip it so the
+         # rate reflects generated tokens only.
+         if not (self.skip_prompt and self.next_tokens_are_prompt):
+             if self.start_time is None:
+                 self.start_time = time.time()
+             self.token_count += 1
+         return super().put(value)
+
+     def get_tps(self):
+         if self.start_time is None:
+             return 0
+         return self.token_count / (time.time() - self.start_time)
+
+ def format_response(text, tps=None):
+     # Append a tokens-per-second footer when a rate is available.
+     return f"{text}\n\n**Tokens per second:** {tps:.2f}" if tps else text
+
+ def chat_response(message, history, max_tokens=512):
+     # history arrives from gr.ChatInterface as (user, assistant) pairs.
+     messages = []
+     for human, assistant in history:
+         messages.append({"role": "user", "content": human})
+         messages.append({"role": "assistant", "content": assistant})
+     messages.append({"role": "user", "content": message})
+     formatted_input = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
+     inputs = tokenizer(formatted_input, return_tensors="pt").to('cpu')
+     streamer = DeepStreamer(tokenizer)
+     generation_kwargs = dict(
+         inputs,
+         streamer=streamer,
+         max_new_tokens=max_tokens,
+         do_sample=True,
+         temperature=0.7,
+         top_p=0.9,
+         eos_token_id=tokenizer.eos_token_id
+     )
+     # Run generation in a background thread so tokens can be streamed
+     # to the UI as they are produced.
+     thread = Thread(target=model.generate, kwargs=generation_kwargs)
+     thread.start()
+     partial_response = ""
+     try:
+         for token in streamer:
+             partial_response += token
+             yield format_response(partial_response, streamer.get_tps())
+
+         # Final yield with the overall rate once streaming has finished.
+         yield format_response(partial_response, streamer.get_tps())
+     finally:
+         thread.join()
+
+ demo = gr.ChatInterface(
+     fn=chat_response,
+     title="DeepSeek-R1-Distill-Qwen-1.5B on CPU",
+     description="Running on CPU, so expect slow token generation.",
+     examples=[
+         "Discuss the future of renewable energy",
+         "What's the history of the Roman Empire?",
+         "What's the capital of China?",
+         "Tell me a fun fact about space"
+     ]
+ )
+
+ if __name__ == "__main__":
+     demo.queue().launch()
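
As a quick local sanity check (not part of the commit), the custom chat template set in app.py can be previewed by rendering a single-turn conversation. This is a minimal sketch that assumes the `tokenizer` and the `chat_template` assignment defined above:

    # Sketch: preview what apply_chat_template produces for one user turn.
    # Assumes the tokenizer and chat_template from app.py are already loaded.
    example = [{"role": "user", "content": "What's the capital of China?"}]
    print(tokenizer.apply_chat_template(example, tokenize=False, add_generation_prompt=True))
    # Expected output:
    # <|im_start|>user
    # What's the capital of China?
    # <|im_end|>
    # <|im_start|>assistant

The printed string is exactly what `chat_response` tokenizes and passes to `model.generate`, so any change to the template shows up directly in this preview.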