Spaces: Running on Zero
Add config
Files changed:

- README.md +40 -7
- app.css +28 -0
- app.py +25 -20
- lib/__init__.py +2 -1
- lib/config.py +54 -0
- lib/generate.py +5 -10
- lib/loader.py +5 -31
README.md CHANGED

@@ -1,8 +1,8 @@
 ---
 # https://huggingface.co/docs/hub/en/spaces-config-reference
-title: Text
-short_description:
-emoji:
+title: Text
+short_description: Serverless small language model inference
+emoji: 🤖
 colorFrom: blue
 colorTo: yellow
 sdk: gradio
@@ -10,8 +10,8 @@ sdk_version: 4.44.1
 python_version: 3.11.9
 app_file: app.py
 fullWidth: false
-pinned:
-header:
+pinned: true
+header: mini
 license: apache-2.0
 preload_from_hub:
   - >-
@@ -26,12 +26,21 @@ preload_from_hub:
   - >-
     HuggingFaceTB/SmolLM2-1.7B-Instruct
     config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+  - >-
+    ibm-granite/granite-3.0-2b-instruct
+    added_tokens.json,config.json,merges.txt,model-00001.safetensors,model-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
   - >-
     Qwen/Qwen2.5-0.5B-Instruct
     config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+  - >-
+    Qwen/Qwen2.5-1.5B-Instruct
+    config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
   - >-
     Qwen/Qwen2.5-Coder-1.5B-Instruct
     config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+  - >-
+    stabilityai/stablelm-2-zephyr-1_6b
+    config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
   - >-
     THUDM/glm-edge-1.5b-chat
     config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json
@@ -39,7 +48,22 @@ preload_from_hub:
 
 # text
 
-
+Serverless small language model inference.
+
+## Models
+
+Ungated models under 2B parameters:
+
+- [01-ai/Yi-Coder-1.5B-Chat](https://huggingface.co/01-ai/Yi-Coder-1.5B-Chat)
+- [HuggingFaceTB/SmolLM2-135M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct)
+- [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct)
+- [HuggingFaceTB/SmolLM2-1.7B-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct)
+- [ibm-granite/granite-3.0-2b-instruct](https://huggingface.co/ibm-granite/granite-3.0-2b-instruct)
+- [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)
+- [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)
+- [Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)
+- [stabilityai/stablelm-2-zephyr-1_6b](https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b)
+- [THUDM/glm-edge-1.5b-chat](https://huggingface.co/THUDM/glm-edge-1.5b-chat)
 
 ## Installation
 
@@ -47,7 +71,6 @@ Simple app for small language model inference.
 # clone
 git clone https://huggingface.co/spaces/adamelliotfields/text.git
 cd text
-git remote set-url origin https://adamelliotfields:$HF_TOKEN@huggingface.co/spaces/adamelliotfields/text
 
 # install
 uv venv
@@ -60,6 +83,16 @@ gradio app.py
 
 ## Development
 
+### Auth
+
+Use existing `HF_TOKEN`:
+
+```sh
+git remote set-url origin https://adamelliotfields:$HF_TOKEN@huggingface.co/spaces/adamelliotfields/text
+```
+
+### PRs
+
 See [pull requests and discussions](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions).
 
 ```sh
app.css ADDED

@@ -0,0 +1,28 @@
+#header {
+  margin-bottom: 8px !important;
+}
+#header > div {
+  display: flex;
+}
+#header > div > h1 > span {
+  font-style: italic;
+  color: #047857 !important;
+}
+#header > div > h1 > span:is(.dark *) {
+  color: #10b981 !important;
+}
+#header > div > svg {
+  width: 1.5rem;
+  height: 1.5rem;
+  margin-top: 0.25rem;
+  margin-left: 0.5rem;
+  align-self: center;
+  fill: #047857 !important;
+  animation: spin 3s linear infinite reverse;
+}
+#header > div > svg:is(.dark *) {
+  fill: #10b981 !important;
+}
+@keyframes spin {
+  100% { transform: rotate(360deg); }
+}
app.py CHANGED

@@ -4,7 +4,7 @@ import gradio as gr
 import numpy as np
 import torch
 
-from lib import generate
+from lib import CONFIG, generate
 
 HEAD = """
 <style>
@@ -14,8 +14,16 @@ HEAD = """
 </style>
 """
 
-
-<
+HEADER = """
+<div id="header">
+  <div>
+    <h1>Text</h1>
+    <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 15 15">
+      <path d="M7.48877 6.75C7.29015 6.75 7.09967 6.82902 6.95923 6.96967C6.81879 7.11032 6.73989 7.30109 6.73989 7.5C6.73989 7.69891 6.81879 7.88968 6.95923 8.03033C7.09967 8.17098 7.29015 8.25 7.48877 8.25C7.68738 8.25 7.87786 8.17098 8.0183 8.03033C8.15874 7.88968 8.23764 7.69891 8.23764 7.5C8.23764 7.30109 8.15874 7.11032 8.0183 6.96967C7.87786 6.82902 7.68738 6.75 7.48877 6.75ZM7.8632 0C11.2331 0 11.3155 2.6775 9.54818 3.5625C8.80679 3.93 8.47728 4.7175 8.335 5.415C8.69446 5.565 9.00899 5.7975 9.24863 6.0975C12.0195 4.5975 15 5.19 15 7.875C15 11.25 12.3265 11.325 11.4428 9.5475C11.0684 8.805 10.2746 8.475 9.57813 8.3325C9.42836 8.6925 9.19621 9 8.89665 9.255C10.3869 12.0225 9.79531 15 7.11433 15C3.74438 15 3.67698 12.315 5.44433 11.43C6.17823 11.0625 6.50774 10.2825 6.65751 9.5925C6.29056 9.4425 5.96855 9.2025 5.72891 8.9025C2.96555 10.3875 0 9.8025 0 7.125C0 3.75 2.666 3.6675 3.54967 5.445C3.92411 6.1875 4.71043 6.51 5.40689 6.6525C5.54918 6.2925 5.78882 5.9775 6.09586 5.7375C4.60559 2.97 5.1972 0 7.8632 0Z"></path>
+    </svg>
+  </div>
+  <p>Serverless small language model inference.</p>
+</div>
 """
 
 SEED = 0
@@ -26,33 +34,28 @@ if gr.NO_RELOAD:
     np.random.seed(SEED)
     torch.manual_seed(SEED)
 
+chatbot = gr.Chatbot(type="messages", show_label=False, height=None, scale=1)
+textbox = gr.Textbox(placeholder="Type a message...", autofocus=True, scale=7)
+
 # https://github.com/gradio-app/gradio/blob/main/gradio/chat_interface.py
 chat_interface = gr.ChatInterface(
     title=None,
     fn=generate,
+    chatbot=chatbot,
+    textbox=textbox,
     type="messages",  # interface type must match bot type
-    description=
-    chatbot=gr.Chatbot(type="messages", show_label=False, height=None, scale=1),
-    textbox=gr.Textbox(placeholder="Type a message...", autofocus=True, scale=7),
+    description=None,
     additional_inputs=[
         gr.Textbox(
-            label="System
+            label="System Message",
             lines=2,
             value="You are a helpful assistant. Be concise and precise.",
         ),
         gr.Dropdown(
             label="Model",
             filterable=False,
-            value="
-            choices=
-                "01-ai/Yi-Coder-1.5B-Chat",
-                "HuggingFaceTB/SmolLM2-135M-Instruct",
-                "HuggingFaceTB/SmolLM2-360M-Instruct",
-                "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-                "Qwen/Qwen2.5-0.5B-Instruct",
-                "Qwen/Qwen2.5-Coder-1.5B-Instruct",
-                "THUDM/glm-edge-1.5b-chat",
-            ],
+            value="Qwen/Qwen2.5-0.5B-Instruct",
+            choices=list(CONFIG.keys()),
         ),
         gr.Slider(
             label="Max new tokens",
@@ -71,6 +74,7 @@ chat_interface = gr.ChatInterface(
            info="Modulates next token probabilities.",
         ),
         gr.Slider(
+            # https://arxiv.org/abs/1909.05858
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
@@ -79,6 +83,7 @@ chat_interface = gr.ChatInterface(
            info="Penalizes repeating tokens.",
         ),
         gr.Slider(
+            # https://arxiv.org/abs/1904.09751
            label="Top-p",
            minimum=0.05,
            maximum=1.0,
@@ -87,6 +92,7 @@ chat_interface = gr.ChatInterface(
            info="Only tokens with cumulative probability p are considered (nucleus sampling).",
         ),
         gr.Slider(
+            # https://arxiv.org/pdf/1805.04833
            label="Top-k",
            minimum=1,
            maximum=100,
@@ -97,9 +103,8 @@ chat_interface = gr.ChatInterface(
     ],
 )
 
-
-
-    gr.HTML(TITLE)
+with gr.Blocks(head=HEAD, css="./app.css", fill_height=True) as demo:
+    gr.HTML(HEADER)
     chat_interface.render()
 
 if __name__ == "__main__":
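Gradio's `ChatInterface` passes the current values of `additional_inputs` to `fn` positionally after the message and history, so the widget order above (system message, model, max new tokens, temperature, repetition penalty, top-p, top-k) has to match the parameter order of `generate`. A minimal standalone sketch of that contract — not code from this commit; the two widgets and the echo function are illustrative only:

```python
# Sketch: additional_inputs values arrive positionally after (message, history).
import gradio as gr


def echo(message, history, system_message, temperature):
    # system_message and temperature come from the additional_inputs below,
    # in the same order they are declared
    return f"[temp={temperature}] {system_message} | you said: {message}"


demo = gr.ChatInterface(
    fn=echo,
    type="messages",
    additional_inputs=[
        gr.Textbox(label="System Message", value="Be concise."),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.6),
    ],
)

if __name__ == "__main__":
    demo.launch()
```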
lib/__init__.py CHANGED

@@ -1,3 +1,4 @@
+from .config import CONFIG
 from .generate import generate
 
-__all__ = ["generate"]
+__all__ = ["CONFIG", "generate"]
lib/config.py ADDED

@@ -0,0 +1,54 @@
+from transformers import (
+    GlmForCausalLM,
+    GPT2TokenizerFast,
+    GraniteForCausalLM,
+    LlamaForCausalLM,
+    LlamaTokenizerFast,
+    PreTrainedTokenizerFast,
+    Qwen2ForCausalLM,
+    Qwen2TokenizerFast,
+    StableLmForCausalLM,
+)
+
+CONFIG = {
+    "01-ai/Yi-Coder-1.5B-Chat": {
+        "model": LlamaForCausalLM,
+        "tokenizer": LlamaTokenizerFast,
+    },
+    "HuggingFaceTB/SmolLM2-135M-Instruct": {
+        "model": LlamaForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "HuggingFaceTB/SmolLM2-360M-Instruct": {
+        "model": LlamaForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "HuggingFaceTB/SmolLM2-1.7B-Instruct": {
+        "model": LlamaForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "ibm-granite/granite-3.0-2b-instruct": {
+        "model": GraniteForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "Qwen/Qwen2.5-0.5B-Instruct": {
+        "model": Qwen2ForCausalLM,
+        "tokenizer": Qwen2TokenizerFast,
+    },
+    "Qwen/Qwen2.5-1.5B-Instruct": {
+        "model": Qwen2ForCausalLM,
+        "tokenizer": Qwen2TokenizerFast,
+    },
+    "Qwen/Qwen2.5-Coder-1.5B-Instruct": {
+        "model": Qwen2ForCausalLM,
+        "tokenizer": Qwen2TokenizerFast,
+    },
+    "stabilityai/stablelm-2-zephyr-1_6b": {
+        "model": StableLmForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "THUDM/glm-edge-1.5b-chat": {
+        "model": GlmForCausalLM,
+        "tokenizer": PreTrainedTokenizerFast,
+    },
+}
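The new `CONFIG` dict pins an explicit model class and tokenizer class to each Hub repo ID instead of relying on auto-classes. A hedged sketch of using one entry directly, outside the Space — the actual app goes through `lib/loader.py` below, and the repo ID here is just an example:

```python
# Sketch: resolve one CONFIG entry to concrete transformers classes.
from lib.config import CONFIG

repo_id = "Qwen/Qwen2.5-0.5B-Instruct"
entry = CONFIG[repo_id]

# Each entry holds classes, not instances, so nothing is downloaded until needed.
tokenizer = entry["tokenizer"].from_pretrained(repo_id)
model = entry["model"].from_pretrained(repo_id)
model.eval()
```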
lib/generate.py CHANGED

@@ -1,8 +1,6 @@
 from threading import Thread
 from typing import Iterator
 
-import torch
-from gradio import Error, Progress
 from spaces import GPU, config
 from transformers import TextIteratorStreamer
 
@@ -13,23 +11,19 @@ from .loader import get_loader
 def generate(
     message: str,
     chat_history: list[dict[str, str]],
-
-    model="
+    system_message="",
+    model="Qwen/Qwen2.5-0.5B-Instruct",
     max_tokens=512,
     temperature=0.6,
     repetition_penalty=1.2,
     top_p=0.9,
     top_k=50,
-    _=Progress(track_tqdm=True),
 ) -> Iterator[str]:
-    if not torch.cuda.is_available():
-        raise Error("CUDA not available")
-
     # Prepend system prompt
     if not chat_history or chat_history[0].get("role") != "system":
-        chat_history.insert(0, {"role": "system", "content":
+        chat_history.insert(0, {"role": "system", "content": system_message})
     else:
-        chat_history[0]["content"] =
+        chat_history[0]["content"] = system_message
 
     # Append user message before generating
     chat_history.append({"role": "user", "content": message})
@@ -83,6 +77,7 @@ def transformers_generate(
         skip_special_tokens=True,
     )
 
+    # https://huggingface.co/blog/how-to-generate
     generate_kwargs = dict(
         do_sample=True,
         streamer=streamer,
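`generate` streams partial text through a `TextIteratorStreamer` while `model.generate` runs in a background thread — the pattern behind the `streamer=streamer` kwarg and the how-to-generate link added above. A self-contained sketch of that pattern, resolving the model with auto-classes rather than this repo's `CONFIG`, purely for illustration:

```python
# Sketch of the thread + TextIteratorStreamer streaming pattern; not the Space's code.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

repo_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative choice
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

# skip_prompt drops the echoed input; decoding skips special tokens
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    inputs=input_ids,
    streamer=streamer,
    do_sample=True,
    max_new_tokens=512,
    temperature=0.6,
    repetition_penalty=1.2,
    top_p=0.9,
    top_k=50,
)

# generate() blocks, so it runs in a thread while the main loop drains the streamer
Thread(target=model.generate, kwargs=generate_kwargs).start()

for chunk in streamer:
    print(chunk, end="", flush=True)
```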
lib/loader.py CHANGED

@@ -1,15 +1,8 @@
 import os
 
 import torch
-
-
-    GPT2Tokenizer,
-    LlamaForCausalLM,
-    LlamaTokenizer,
-    PreTrainedTokenizerFast,
-    Qwen2ForCausalLM,
-    Qwen2Tokenizer,
-)
+
+from .config import CONFIG
 
 
 class Loader:
@@ -31,29 +24,10 @@ class Loader:
             "low_cpu_mem_usage": True,
             "torch_dtype": torch.bfloat16 if cuda_capability >= 8 else torch.float16,
         }
-        model_fns = {
-            # Could have used auto-classes or a pipeline
-            "01-ai/Yi-Coder-1.5B-Chat": LlamaForCausalLM.from_pretrained,
-            "HuggingFaceTB/SmolLM2-135M-Instruct": LlamaForCausalLM.from_pretrained,
-            "HuggingFaceTB/SmolLM2-360M-Instruct": LlamaForCausalLM.from_pretrained,
-            "HuggingFaceTB/SmolLM2-1.7B-Instruct": LlamaForCausalLM.from_pretrained,
-            "Qwen/Qwen2.5-0.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
-            "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
-            "THUDM/glm-edge-1.5b-chat": GlmForCausalLM.from_pretrained,
-        }
-        model_tokenizers = {
-            "01-ai/Yi-Coder-1.5B-Chat": LlamaTokenizer,
-            "HuggingFaceTB/SmolLM2-135M-Instruct": GPT2Tokenizer,
-            "HuggingFaceTB/SmolLM2-360M-Instruct": GPT2Tokenizer,
-            "HuggingFaceTB/SmolLM2-1.7B-Instruct": GPT2Tokenizer,
-            "Qwen/Qwen2.5-0.5B-Instruct": Qwen2Tokenizer,
-            "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2Tokenizer,
-            "THUDM/glm-edge-1.5b-chat": PreTrainedTokenizerFast,
-        }
 
-
-        self.tokenizer =
-        self.llm =
+        config = CONFIG[model]
+        self.tokenizer = config["tokenizer"].from_pretrained(model)
+        self.llm = config["model"].from_pretrained(model, **kwargs)
         self.llm.eval()
         self.model = model
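`lib/generate.py` imports a `get_loader` helper from this module (visible in the hunk header above), but its body is not part of this diff. The following is only an assumption of how such a helper might cache a single `Loader` and reload weights only when the selected model changes; the `Loader(model)` constructor signature is hypothetical:

```python
# Hypothetical sketch of a get_loader() cache; not shown in this commit.
_loader = None


def get_loader(model: str):
    """Return a cached Loader, rebuilding it only when the model changes."""
    global _loader
    if _loader is None or _loader.model != model:
        _loader = Loader(model)  # assumed constructor; the diff only shows attributes
    return _loader
```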