adamelliotfields committed
Commit 3eb01b6 · verified · 1 Parent(s): 249c14b

Add config

Files changed (7)
  1. README.md +40 -7
  2. app.css +28 -0
  3. app.py +25 -20
  4. lib/__init__.py +2 -1
  5. lib/config.py +54 -0
  6. lib/generate.py +5 -10
  7. lib/loader.py +5 -31
README.md CHANGED
@@ -1,8 +1,8 @@
 ---
 # https://huggingface.co/docs/hub/en/spaces-config-reference
-title: Text Generation
-short_description: Simple app for small language model inference
-emoji: ⌨️
+title: Text
+short_description: Serverless small language model inference
+emoji: 🤖
 colorFrom: blue
 colorTo: yellow
 sdk: gradio
@@ -10,8 +10,8 @@ sdk_version: 4.44.1
 python_version: 3.11.9
 app_file: app.py
 fullWidth: false
-pinned: false
-header: default
+pinned: true
+header: mini
 license: apache-2.0
 preload_from_hub:
 - >-
@@ -26,12 +26,21 @@ preload_from_hub:
 - >-
   HuggingFaceTB/SmolLM2-1.7B-Instruct
   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+- >-
+  ibm-granite/granite-3.0-2b-instruct
+  added_tokens.json,config.json,merges.txt,model-00001.safetensors,model-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
 - >-
   Qwen/Qwen2.5-0.5B-Instruct
   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+- >-
+  Qwen/Qwen2.5-1.5B-Instruct
+  config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
 - >-
   Qwen/Qwen2.5-Coder-1.5B-Instruct
   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+- >-
+  stabilityai/stablelm-2-zephyr-1_6b
+  config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
 - >-
   THUDM/glm-edge-1.5b-chat
   config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json
@@ -39,7 +48,22 @@ preload_from_hub:
 
 # text
 
-Simple app for small language model inference.
+Serverless small language model inference.
+
+## Models
+
+Ungated models under 2B parameters:
+
+- [01-ai/Yi-Coder-1.5B-Chat](https://huggingface.co/01-ai/Yi-Coder-1.5B-Chat)
+- [HuggingFaceTB/SmolLM2-135M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct)
+- [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct)
+- [HuggingFaceTB/SmolLM2-1.7B-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct)
+- [ibm-granite/granite-3.0-2b-instruct](https://huggingface.co/ibm-granite/granite-3.0-2b-instruct)
+- [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)
+- [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)
+- [Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)
+- [stabilityai/stablelm-2-zephyr-1_6b](https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b)
+- [THUDM/glm-edge-1.5b-chat](https://huggingface.co/THUDM/glm-edge-1.5b-chat)
 
 ## Installation
 
@@ -47,7 +71,6 @@ Simple app for small language model inference.
 # clone
 git clone https://huggingface.co/spaces/adamelliotfields/text.git
 cd text
-git remote set-url origin https://adamelliotfields:[email protected]/spaces/adamelliotfields/text
 
 # install
 uv venv
@@ -60,6 +83,16 @@ gradio app.py
 
 ## Development
 
+### Auth
+
+Use existing `HF_TOKEN`:
+
+```sh
+git remote set-url origin https://adamelliotfields:[email protected]/spaces/adamelliotfields/text
+```
+
+### PRs
+
 See [pull requests and discussions](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions).
 
 ```sh
app.css ADDED
@@ -0,0 +1,28 @@
+#header {
+  margin-bottom: 8px !important;
+}
+#header > div {
+  display: flex;
+}
+#header > div > h1 > span {
+  font-style: italic;
+  color: #047857 !important;
+}
+#header > div > h1 > span:is(.dark *) {
+  color: #10b981 !important;
+}
+#header > div > svg {
+  width: 1.5rem;
+  height: 1.5rem;
+  margin-top: 0.25rem;
+  margin-left: 0.5rem;
+  align-self: center;
+  fill: #047857 !important;
+  animation: spin 3s linear infinite reverse;
+}
+#header > div > svg:is(.dark *) {
+  fill: #10b981 !important;
+}
+@keyframes spin {
+  100% { transform: rotate(360deg); }
+}
app.py CHANGED
@@ -4,7 +4,7 @@ import gradio as gr
 import numpy as np
 import torch
 
-from lib import generate
+from lib import CONFIG, generate
 
 HEAD = """
 <style>
@@ -14,8 +14,16 @@ HEAD = """
 </style>
 """
 
-TITLE = """
-<h1>Text Generation</h1>
+HEADER = """
+<div id="header">
+  <div>
+    <h1>Text</h1>
+    <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 15 15">
+      <path d="M7.48877 6.75C7.29015 6.75 7.09967 6.82902 6.95923 6.96967C6.81879 7.11032 6.73989 7.30109 6.73989 7.5C6.73989 7.69891 6.81879 7.88968 6.95923 8.03033C7.09967 8.17098 7.29015 8.25 7.48877 8.25C7.68738 8.25 7.87786 8.17098 8.0183 8.03033C8.15874 7.88968 8.23764 7.69891 8.23764 7.5C8.23764 7.30109 8.15874 7.11032 8.0183 6.96967C7.87786 6.82902 7.68738 6.75 7.48877 6.75ZM7.8632 0C11.2331 0 11.3155 2.6775 9.54818 3.5625C8.80679 3.93 8.47728 4.7175 8.335 5.415C8.69446 5.565 9.00899 5.7975 9.24863 6.0975C12.0195 4.5975 15 5.19 15 7.875C15 11.25 12.3265 11.325 11.4428 9.5475C11.0684 8.805 10.2746 8.475 9.57813 8.3325C9.42836 8.6925 9.19621 9 8.89665 9.255C10.3869 12.0225 9.79531 15 7.11433 15C3.74438 15 3.67698 12.315 5.44433 11.43C6.17823 11.0625 6.50774 10.2825 6.65751 9.5925C6.29056 9.4425 5.96855 9.2025 5.72891 8.9025C2.96555 10.3875 0 9.8025 0 7.125C0 3.75 2.666 3.6675 3.54967 5.445C3.92411 6.1875 4.71043 6.51 5.40689 6.6525C5.54918 6.2925 5.78882 5.9775 6.09586 5.7375C4.60559 2.97 5.1972 0 7.8632 0Z"></path>
+    </svg>
+  </div>
+  <p>Serverless small language model inference.</p>
+</div>
 """
 
 SEED = 0
@@ -26,33 +34,28 @@ if gr.NO_RELOAD:
     np.random.seed(SEED)
     torch.manual_seed(SEED)
 
+chatbot = gr.Chatbot(type="messages", show_label=False, height=None, scale=1)
+textbox = gr.Textbox(placeholder="Type a message...", autofocus=True, scale=7)
+
 # https://github.com/gradio-app/gradio/blob/main/gradio/chat_interface.py
 chat_interface = gr.ChatInterface(
     title=None,
     fn=generate,
+    chatbot=chatbot,
+    textbox=textbox,
     type="messages",  # interface type must match bot type
-    description="Simple app for small language model inference.",
-    chatbot=gr.Chatbot(type="messages", show_label=False, height=None, scale=1),
-    textbox=gr.Textbox(placeholder="Type a message...", autofocus=True, scale=7),
+    description=None,
     additional_inputs=[
         gr.Textbox(
-            label="System Prompt",
+            label="System Message",
            lines=2,
            value="You are a helpful assistant. Be concise and precise.",
         ),
         gr.Dropdown(
            label="Model",
            filterable=False,
-            value="HuggingFaceTB/SmolLM2-135M-Instruct",
-            choices=[
-                "01-ai/Yi-Coder-1.5B-Chat",
-                "HuggingFaceTB/SmolLM2-135M-Instruct",
-                "HuggingFaceTB/SmolLM2-360M-Instruct",
-                "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-                "Qwen/Qwen2.5-0.5B-Instruct",
-                "Qwen/Qwen2.5-Coder-1.5B-Instruct",
-                "THUDM/glm-edge-1.5b-chat",
-            ],
+            value="Qwen/Qwen2.5-0.5B-Instruct",
+            choices=list(CONFIG.keys()),
         ),
         gr.Slider(
            label="Max new tokens",
@@ -71,6 +74,7 @@ chat_interface = gr.ChatInterface(
            info="Modulates next token probabilities.",
         ),
         gr.Slider(
+            # https://arxiv.org/abs/1909.05858
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
@@ -79,6 +83,7 @@ chat_interface = gr.ChatInterface(
            info="Penalizes repeating tokens.",
         ),
         gr.Slider(
+            # https://arxiv.org/abs/1904.09751
            label="Top-p",
            minimum=0.05,
            maximum=1.0,
@@ -87,6 +92,7 @@ chat_interface = gr.ChatInterface(
            info="Only tokens with cumulative probability p are considered (nucleus sampling).",
         ),
         gr.Slider(
+            # https://arxiv.org/pdf/1805.04833
            label="Top-k",
            minimum=1,
            maximum=100,
@@ -97,9 +103,8 @@ chat_interface = gr.ChatInterface(
     ],
 )
 
-
-with gr.Blocks(head=HEAD, fill_height=True) as demo:
-    gr.HTML(TITLE)
+with gr.Blocks(head=HEAD, css="./app.css", fill_height=True) as demo:
+    gr.HTML(HEADER)
     chat_interface.render()
 
 if __name__ == "__main__":
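
Note: `gr.ChatInterface` calls its `fn` with the message and history first, then each `additional_inputs` value in order, which is why the widget order above must mirror the parameter order of `generate` in `lib/generate.py`. A minimal sketch of that contract, using a hypothetical `echo` function and illustrative slider bounds (not part of this commit):

```python
import gradio as gr

# Hypothetical stand-in for lib.generate.generate: ChatInterface passes
# (message, history) first, then the additional_inputs values in order.
def echo(message, history, system_message, model, max_tokens):
    yield f"[{model} | {system_message!r} | max {max_tokens}] {message}"

demo = gr.ChatInterface(
    fn=echo,
    type="messages",
    additional_inputs=[
        gr.Textbox(label="System Message", value="You are a helpful assistant."),
        gr.Dropdown(label="Model", choices=["Qwen/Qwen2.5-0.5B-Instruct"], value="Qwen/Qwen2.5-0.5B-Instruct"),
        gr.Slider(label="Max new tokens", minimum=1, maximum=2048, value=512),
    ],
)

if __name__ == "__main__":
    demo.launch()
```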
lib/__init__.py CHANGED
@@ -1,3 +1,4 @@
+from .config import CONFIG
 from .generate import generate
 
-__all__ = ["generate"]
+__all__ = ["CONFIG", "generate"]
lib/config.py ADDED
@@ -0,0 +1,54 @@
+from transformers import (
+    GlmForCausalLM,
+    GPT2TokenizerFast,
+    GraniteForCausalLM,
+    LlamaForCausalLM,
+    LlamaTokenizerFast,
+    PreTrainedTokenizerFast,
+    Qwen2ForCausalLM,
+    Qwen2TokenizerFast,
+    StableLmForCausalLM,
+)
+
+CONFIG = {
+    "01-ai/Yi-Coder-1.5B-Chat": {
+        "model": LlamaForCausalLM,
+        "tokenizer": LlamaTokenizerFast,
+    },
+    "HuggingFaceTB/SmolLM2-135M-Instruct": {
+        "model": LlamaForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "HuggingFaceTB/SmolLM2-360M-Instruct": {
+        "model": LlamaForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "HuggingFaceTB/SmolLM2-1.7B-Instruct": {
+        "model": LlamaForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "ibm-granite/granite-3.0-2b-instruct": {
+        "model": GraniteForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "Qwen/Qwen2.5-0.5B-Instruct": {
+        "model": Qwen2ForCausalLM,
+        "tokenizer": Qwen2TokenizerFast,
+    },
+    "Qwen/Qwen2.5-1.5B-Instruct": {
+        "model": Qwen2ForCausalLM,
+        "tokenizer": Qwen2TokenizerFast,
+    },
+    "Qwen/Qwen2.5-Coder-1.5B-Instruct": {
+        "model": Qwen2ForCausalLM,
+        "tokenizer": Qwen2TokenizerFast,
+    },
+    "stabilityai/stablelm-2-zephyr-1_6b": {
+        "model": StableLmForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "THUDM/glm-edge-1.5b-chat": {
+        "model": GlmForCausalLM,
+        "tokenizer": PreTrainedTokenizerFast,
+    },
+}
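
Note: each `CONFIG` entry maps a repo id to the explicit model and tokenizer classes to use instead of the `Auto*` classes. A minimal sketch of how an entry is consumed (mirroring the lookup `lib/loader.py` now does); it assumes the package is importable as `lib` from the repo root and downloads weights on first run:

```python
from lib.config import CONFIG

model_id = "Qwen/Qwen2.5-0.5B-Instruct"
entry = CONFIG[model_id]

# Same pattern as lib/loader.py: resolve the classes from CONFIG, then load.
tokenizer = entry["tokenizer"].from_pretrained(model_id)
model = entry["model"].from_pretrained(model_id)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "Hello!"}],
    tokenize=False,
    add_generation_prompt=True,
)
inputs = tokenizer(prompt, return_tensors="pt")
output = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```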
lib/generate.py CHANGED
@@ -1,8 +1,6 @@
 from threading import Thread
 from typing import Iterator
 
-import torch
-from gradio import Error, Progress
 from spaces import GPU, config
 from transformers import TextIteratorStreamer
 
@@ -13,23 +11,19 @@ from .loader import get_loader
 def generate(
     message: str,
     chat_history: list[dict[str, str]],
-    system_prompt="",
-    model="HuggingFaceTB/SmolLM2-135M-Instruct",
+    system_message="",
+    model="Qwen/Qwen2.5-0.5B-Instruct",
     max_tokens=512,
     temperature=0.6,
     repetition_penalty=1.2,
     top_p=0.9,
     top_k=50,
-    _=Progress(track_tqdm=True),
 ) -> Iterator[str]:
-    if not torch.cuda.is_available():
-        raise Error("CUDA not available")
-
     # Prepend system prompt
     if not chat_history or chat_history[0].get("role") != "system":
-        chat_history.insert(0, {"role": "system", "content": system_prompt})
+        chat_history.insert(0, {"role": "system", "content": system_message})
     else:
-        chat_history[0]["content"] = system_prompt
+        chat_history[0]["content"] = system_message
 
     # Append user message before generating
     chat_history.append({"role": "user", "content": message})
@@ -83,6 +77,7 @@ def transformers_generate(
        skip_special_tokens=True,
    )
 
+    # https://huggingface.co/blog/how-to-generate
    generate_kwargs = dict(
        do_sample=True,
        streamer=streamer,
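
Note: the hunks above only show fragments of `transformers_generate`. The underlying pattern is the standard `TextIteratorStreamer` recipe: run `model.generate` on a worker thread and iterate the streamer for incremental text. A self-contained sketch of that pattern (not the actual implementation; the model id and sampling values are the defaults visible in this diff):

```python
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Illustrative sketch of the streaming pattern generate() builds on.
model_id = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

messages = [
    {"role": "system", "content": "You are a helpful assistant. Be concise and precise."},
    {"role": "user", "content": "Explain nucleus sampling in one sentence."},
]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    inputs=inputs,
    streamer=streamer,
    do_sample=True,
    max_new_tokens=512,
    temperature=0.6,
    repetition_penalty=1.2,
    top_p=0.9,
    top_k=50,
)

# Generation runs on a worker thread so the streamer can be consumed here.
Thread(target=model.generate, kwargs=generate_kwargs).start()
for token_text in streamer:
    print(token_text, end="", flush=True)
```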
lib/loader.py CHANGED
@@ -1,15 +1,8 @@
 import os
 
 import torch
-from transformers import (
-    GlmForCausalLM,
-    GPT2Tokenizer,
-    LlamaForCausalLM,
-    LlamaTokenizer,
-    PreTrainedTokenizerFast,
-    Qwen2ForCausalLM,
-    Qwen2Tokenizer,
-)
+
+from .config import CONFIG
 
 
 class Loader:
@@ -31,29 +24,10 @@
            "low_cpu_mem_usage": True,
            "torch_dtype": torch.bfloat16 if cuda_capability >= 8 else torch.float16,
        }
-        model_fns = {
-            # Could have used auto-classes or a pipeline
-            "01-ai/Yi-Coder-1.5B-Chat": LlamaForCausalLM.from_pretrained,
-            "HuggingFaceTB/SmolLM2-135M-Instruct": LlamaForCausalLM.from_pretrained,
-            "HuggingFaceTB/SmolLM2-360M-Instruct": LlamaForCausalLM.from_pretrained,
-            "HuggingFaceTB/SmolLM2-1.7B-Instruct": LlamaForCausalLM.from_pretrained,
-            "Qwen/Qwen2.5-0.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
-            "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
-            "THUDM/glm-edge-1.5b-chat": GlmForCausalLM.from_pretrained,
-        }
-        model_tokenizers = {
-            "01-ai/Yi-Coder-1.5B-Chat": LlamaTokenizer,
-            "HuggingFaceTB/SmolLM2-135M-Instruct": GPT2Tokenizer,
-            "HuggingFaceTB/SmolLM2-360M-Instruct": GPT2Tokenizer,
-            "HuggingFaceTB/SmolLM2-1.7B-Instruct": GPT2Tokenizer,
-            "Qwen/Qwen2.5-0.5B-Instruct": Qwen2Tokenizer,
-            "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2Tokenizer,
-            "THUDM/glm-edge-1.5b-chat": PreTrainedTokenizerFast,
-        }
 
-        llm_fn = model_fns[model]
-        self.tokenizer = model_tokenizers[model].from_pretrained(model)
-        self.llm = llm_fn(model, **kwargs)
+        config = CONFIG[model]
+        self.tokenizer = config["tokenizer"].from_pretrained(model)
+        self.llm = config["model"].from_pretrained(model, **kwargs)
        self.llm.eval()
        self.model = model
 
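
Note: the hunk keeps `self.model = model` on the `Loader`, and `lib/generate.py` imports `get_loader`, which suggests a cached, reload-on-change loader. Neither `get_loader` nor the `cuda_capability` computation appears in this diff, so the following is only a hypothetical sketch of that shape, with the `CONFIG` lookup copied from the hunk:

```python
import torch

from lib.config import CONFIG


class Loader:
    """Hypothetical reconstruction: cache one model and reload only on change."""

    def __init__(self):
        self.model = None
        self.llm = None
        self.tokenizer = None

    def load(self, model: str):
        if model == self.model:
            return self  # assumption: skip reloading when the model is unchanged

        # assumption: a capability check like this feeds the dtype choice above
        cuda_capability = torch.cuda.get_device_capability()[0] if torch.cuda.is_available() else 0
        kwargs = {
            "low_cpu_mem_usage": True,
            "torch_dtype": torch.bfloat16 if cuda_capability >= 8 else torch.float16,
        }

        config = CONFIG[model]
        self.tokenizer = config["tokenizer"].from_pretrained(model)
        self.llm = config["model"].from_pretrained(model, **kwargs)
        self.llm.eval()
        self.model = model
        return self


_loader = Loader()


def get_loader():
    return _loader
```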