Update app.py
app.py
CHANGED
@@ -11,25 +11,30 @@ import spaces
 import time
 import subprocess
 
+# Install the flash-attn library; the CUDA build is skipped.
 subprocess.run(
     "pip install flash-attn --no-build-isolation",
     env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
     shell=True,
 )
 
+# Read the Hugging Face token
 token = os.environ["HF_TOKEN"]
 
-
+# Load the microsoft/Phi-3-mini-128k-instruct model and tokenizer
 model = AutoModelForCausalLM.from_pretrained(
     "microsoft/Phi-3-mini-128k-instruct",
     token=token,
     trust_remote_code=True,
 )
 tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct", token=token)
+
+# Set the end-of-sequence token IDs
 terminators = [
     tok.eos_token_id,
 ]
 
+# Use the GPU if one is available, otherwise fall back to the CPU
 if torch.cuda.is_available():
     device = torch.device("cuda")
     print(f"Using GPU: {torch.cuda.get_device_name(device)}")
@@ -38,37 +43,46 @@ else:
     print("Using CPU")
 
 model = model.to(device)
-# Dispatch Errors
-
 
+# Run chat() on a Spaces GPU, with up to 60 seconds of GPU time per call.
 @spaces.GPU(duration=60)
 def chat(message, history, temperature, do_sample, max_tokens):
+    # Convert the chat history into the expected message format
     chat = []
     for item in history:
         chat.append({"role": "user", "content": item[0]})
         if item[1] is not None:
             chat.append({"role": "assistant", "content": item[1]})
     chat.append({"role": "user", "content": message})
+
+    # Apply the chat template and tokenize the input
     messages = tok.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
     model_inputs = tok([messages], return_tensors="pt").to(device)
+
+    # Stream the model output with TextIteratorStreamer
     streamer = TextIteratorStreamer(
         tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
     )
+
+    # Set the generation parameters
     generate_kwargs = dict(
         model_inputs,
         streamer=streamer,
-        max_new_tokens=max_tokens,
-        do_sample=True,
-        temperature=temperature,
-        eos_token_id=terminators,
+        max_new_tokens=max_tokens,  # maximum number of new tokens to generate
+        do_sample=True,  # whether to sample
+        temperature=temperature,  # higher values increase diversity
+        eos_token_id=terminators,  # end-of-sequence token IDs
     )
 
+    # Disable sampling when the temperature is 0
     if temperature == 0:
         generate_kwargs["do_sample"] = False
 
+    # Start generation in a separate thread
    t = Thread(target=model.generate, kwargs=generate_kwargs)
    t.start()
 
+    # Yield the generated text as it streams in
     partial_text = ""
     for new_text in streamer:
         partial_text += new_text
@@ -76,11 +90,10 @@ def chat(message, history, temperature, do_sample, max_tokens):
 
     yield partial_text
 
-
+# Build the chat UI with Gradio's ChatInterface
 demo = gr.ChatInterface(
     fn=chat,
     examples=[["Write me a poem about Machine Learning."]],
-    # multimodal=False,
     additional_inputs_accordion=gr.Accordion(
         label="⚙️ Parameters", open=False, render=False
     ),
@@ -102,4 +115,6 @@ demo = gr.ChatInterface(
     title="Chat With LLMs",
     description="Now Running [microsoft/Phi-3-mini-128k-instruct](https://huggingface.co/microsoft/Phi-3-mini-128k-instruct)",
 )
-
+
+# Launch the Gradio interface
+demo.launch()
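Note: the diff skips the additional_inputs region (old lines 86-101 / new lines 99-114), where the controls behind the "⚙️ Parameters" accordion that feed chat() are defined. Below is a minimal sketch of what that block plausibly looks like; the component types, labels, ranges, and defaults are assumptions for illustration, not taken from this commit.

import gradio as gr

# Hypothetical sketch only: the real definitions are in the part of app.py this diff does not show.
additional_inputs = [
    gr.Slider(minimum=0, maximum=1, step=0.1, value=0.7, label="Temperature"),        # passed to chat() as `temperature`
    gr.Checkbox(value=True, label="Sampling"),                                         # passed to chat() as `do_sample`
    gr.Slider(minimum=128, maximum=4096, step=1, value=1024, label="Max new tokens"),  # passed to chat() as `max_tokens`
]

Whatever the real components are, gr.ChatInterface passes them to chat() after (message, history) in the order temperature, do_sample, max_tokens. Also visible in the hunks above: chat() never reads its do_sample argument; sampling is hard-coded on in generate_kwargs and only switched off when temperature == 0.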