Spaces: Running on Zero
Add config
Files changed:

- README.md +40 -7
- app.css +28 -0
- app.py +25 -20
- lib/__init__.py +2 -1
- lib/config.py +54 -0
- lib/generate.py +5 -10
- lib/loader.py +5 -31
README.md CHANGED

@@ -1,8 +1,8 @@
 ---
 # https://huggingface.co/docs/hub/en/spaces-config-reference
-title: Text
-short_description:
-emoji:
+title: Text
+short_description: Serverless small language model inference
+emoji: 🤖
 colorFrom: blue
 colorTo: yellow
 sdk: gradio
@@ -10,8 +10,8 @@ sdk_version: 4.44.1
 python_version: 3.11.9
 app_file: app.py
 fullWidth: false
-pinned:
-header:
+pinned: true
+header: mini
 license: apache-2.0
 preload_from_hub:
   - >-
@@ -26,12 +26,21 @@ preload_from_hub:
   - >-
     HuggingFaceTB/SmolLM2-1.7B-Instruct
     config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+  - >-
+    ibm-granite/granite-3.0-2b-instruct
+    added_tokens.json,config.json,merges.txt,model-00001.safetensors,model-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
   - >-
     Qwen/Qwen2.5-0.5B-Instruct
     config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+  - >-
+    Qwen/Qwen2.5-1.5B-Instruct
+    config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
   - >-
     Qwen/Qwen2.5-Coder-1.5B-Instruct
     config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+  - >-
+    stabilityai/stablelm-2-zephyr-1_6b
+    config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
   - >-
     THUDM/glm-edge-1.5b-chat
     config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json
@@ -39,7 +48,22 @@ preload_from_hub:
 
 # text
 
-
+Serverless small language model inference.
+
+## Models
+
+Ungated models under 2B parameters:
+
+- [01-ai/Yi-Coder-1.5B-Chat](https://huggingface.co/01-ai/Yi-Coder-1.5B-Chat)
+- [HuggingFaceTB/SmolLM2-135M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-135M-Instruct)
+- [HuggingFaceTB/SmolLM2-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-360M-Instruct)
+- [HuggingFaceTB/SmolLM2-1.7B-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM2-1.7B-Instruct)
+- [ibm-granite/granite-3.0-2b-instruct](https://huggingface.co/ibm-granite/granite-3.0-2b-instruct)
+- [Qwen/Qwen2.5-0.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct)
+- [Qwen/Qwen2.5-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct)
+- [Qwen/Qwen2.5-Coder-1.5B-Instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct)
+- [stabilityai/stablelm-2-zephyr-1_6b](https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b)
+- [THUDM/glm-edge-1.5b-chat](https://huggingface.co/THUDM/glm-edge-1.5b-chat)
 
 ## Installation
 
@@ -47,7 +71,6 @@ Simple app for small language model inference.
 # clone
 git clone https://huggingface.co/spaces/adamelliotfields/text.git
 cd text
-git remote set-url origin https://adamelliotfields:$HF_TOKEN@huggingface.co/spaces/adamelliotfields/text
 
 # install
 uv venv
@@ -60,6 +83,16 @@ gradio app.py
 
 ## Development
 
+### Auth
+
+Use existing `HF_TOKEN`:
+
+```sh
+git remote set-url origin https://adamelliotfields:$HF_TOKEN@huggingface.co/spaces/adamelliotfields/text
+```
+
+### PRs
+
 See [pull requests and discussions](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions).
 
 ```sh
app.css ADDED

@@ -0,0 +1,28 @@
+#header {
+  margin-bottom: 8px !important;
+}
+#header > div {
+  display: flex;
+}
+#header > div > h1 > span {
+  font-style: italic;
+  color: #047857 !important;
+}
+#header > div > h1 > span:is(.dark *) {
+  color: #10b981 !important;
+}
+#header > div > svg {
+  width: 1.5rem;
+  height: 1.5rem;
+  margin-top: 0.25rem;
+  margin-left: 0.5rem;
+  align-self: center;
+  fill: #047857 !important;
+  animation: spin 3s linear infinite reverse;
+}
+#header > div > svg:is(.dark *) {
+  fill: #10b981 !important;
+}
+@keyframes spin {
+  100% { transform: rotate(360deg); }
+}
app.py CHANGED

@@ -4,7 +4,7 @@ import gradio as gr
 import numpy as np
 import torch
 
-from lib import generate
+from lib import CONFIG, generate
 
 HEAD = """
 <style>
@@ -14,8 +14,16 @@ HEAD = """
 </style>
 """
 
-
-<
+HEADER = """
+<div id="header">
+  <div>
+    <h1>Text</h1>
+    <svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 15 15">
+      <path d="M7.48877 6.75C7.29015 6.75 7.09967 6.82902 6.95923 6.96967C6.81879 7.11032 6.73989 7.30109 6.73989 7.5C6.73989 7.69891 6.81879 7.88968 6.95923 8.03033C7.09967 8.17098 7.29015 8.25 7.48877 8.25C7.68738 8.25 7.87786 8.17098 8.0183 8.03033C8.15874 7.88968 8.23764 7.69891 8.23764 7.5C8.23764 7.30109 8.15874 7.11032 8.0183 6.96967C7.87786 6.82902 7.68738 6.75 7.48877 6.75ZM7.8632 0C11.2331 0 11.3155 2.6775 9.54818 3.5625C8.80679 3.93 8.47728 4.7175 8.335 5.415C8.69446 5.565 9.00899 5.7975 9.24863 6.0975C12.0195 4.5975 15 5.19 15 7.875C15 11.25 12.3265 11.325 11.4428 9.5475C11.0684 8.805 10.2746 8.475 9.57813 8.3325C9.42836 8.6925 9.19621 9 8.89665 9.255C10.3869 12.0225 9.79531 15 7.11433 15C3.74438 15 3.67698 12.315 5.44433 11.43C6.17823 11.0625 6.50774 10.2825 6.65751 9.5925C6.29056 9.4425 5.96855 9.2025 5.72891 8.9025C2.96555 10.3875 0 9.8025 0 7.125C0 3.75 2.666 3.6675 3.54967 5.445C3.92411 6.1875 4.71043 6.51 5.40689 6.6525C5.54918 6.2925 5.78882 5.9775 6.09586 5.7375C4.60559 2.97 5.1972 0 7.8632 0Z"></path>
+    </svg>
+  </div>
+  <p>Serverless small language model inference.</p>
+</div>
 """
 
 SEED = 0
@@ -26,33 +34,28 @@ if gr.NO_RELOAD:
     np.random.seed(SEED)
     torch.manual_seed(SEED)
 
+chatbot = gr.Chatbot(type="messages", show_label=False, height=None, scale=1)
+textbox = gr.Textbox(placeholder="Type a message...", autofocus=True, scale=7)
+
 # https://github.com/gradio-app/gradio/blob/main/gradio/chat_interface.py
 chat_interface = gr.ChatInterface(
     title=None,
     fn=generate,
+    chatbot=chatbot,
+    textbox=textbox,
     type="messages",  # interface type must match bot type
-    description=
-    chatbot=gr.Chatbot(type="messages", show_label=False, height=None, scale=1),
-    textbox=gr.Textbox(placeholder="Type a message...", autofocus=True, scale=7),
+    description=None,
     additional_inputs=[
         gr.Textbox(
-            label="System
+            label="System Message",
             lines=2,
             value="You are a helpful assistant. Be concise and precise.",
         ),
         gr.Dropdown(
             label="Model",
             filterable=False,
-            value="
-            choices=
-                "01-ai/Yi-Coder-1.5B-Chat",
-                "HuggingFaceTB/SmolLM2-135M-Instruct",
-                "HuggingFaceTB/SmolLM2-360M-Instruct",
-                "HuggingFaceTB/SmolLM2-1.7B-Instruct",
-                "Qwen/Qwen2.5-0.5B-Instruct",
-                "Qwen/Qwen2.5-Coder-1.5B-Instruct",
-                "THUDM/glm-edge-1.5b-chat",
-            ],
+            value="Qwen/Qwen2.5-0.5B-Instruct",
+            choices=list(CONFIG.keys()),
         ),
         gr.Slider(
             label="Max new tokens",
@@ -71,6 +74,7 @@ chat_interface = gr.ChatInterface(
            info="Modulates next token probabilities.",
         ),
         gr.Slider(
+            # https://arxiv.org/abs/1909.05858
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
@@ -79,6 +83,7 @@ chat_interface = gr.ChatInterface(
            info="Penalizes repeating tokens.",
         ),
         gr.Slider(
+            # https://arxiv.org/abs/1904.09751
            label="Top-p",
            minimum=0.05,
            maximum=1.0,
@@ -87,6 +92,7 @@ chat_interface = gr.ChatInterface(
            info="Only tokens with cumulative probability p are considered (nucleus sampling).",
         ),
         gr.Slider(
+            # https://arxiv.org/pdf/1805.04833
            label="Top-k",
            minimum=1,
            maximum=100,
@@ -97,9 +103,8 @@ chat_interface = gr.ChatInterface(
     ],
 )
 
-
-
-    gr.HTML(TITLE)
+with gr.Blocks(head=HEAD, css="./app.css", fill_height=True) as demo:
+    gr.HTML(HEADER)
     chat_interface.render()
 
 if __name__ == "__main__":
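Gradio's `ChatInterface` passes the current values of `additional_inputs` to `fn` positionally after the message and history, so the widget order above (system message, model, max new tokens, temperature, repetition penalty, top-p, top-k) has to match the parameter order of `generate`. A minimal standalone sketch of that contract — not code from this commit; the two widgets and the echo function are illustrative only:

```python
# Sketch: additional_inputs values arrive positionally after (message, history).
import gradio as gr


def echo(message, history, system_message, temperature):
    # system_message and temperature come from the additional_inputs below,
    # in the same order they are declared
    return f"[temp={temperature}] {system_message} | you said: {message}"


demo = gr.ChatInterface(
    fn=echo,
    type="messages",
    additional_inputs=[
        gr.Textbox(label="System Message", value="Be concise."),
        gr.Slider(label="Temperature", minimum=0.1, maximum=1.0, value=0.6),
    ],
)

if __name__ == "__main__":
    demo.launch()
```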
lib/__init__.py CHANGED

@@ -1,3 +1,4 @@
+from .config import CONFIG
 from .generate import generate
 
-__all__ = ["generate"]
+__all__ = ["CONFIG", "generate"]
lib/config.py ADDED

@@ -0,0 +1,54 @@
+from transformers import (
+    GlmForCausalLM,
+    GPT2TokenizerFast,
+    GraniteForCausalLM,
+    LlamaForCausalLM,
+    LlamaTokenizerFast,
+    PreTrainedTokenizerFast,
+    Qwen2ForCausalLM,
+    Qwen2TokenizerFast,
+    StableLmForCausalLM,
+)
+
+CONFIG = {
+    "01-ai/Yi-Coder-1.5B-Chat": {
+        "model": LlamaForCausalLM,
+        "tokenizer": LlamaTokenizerFast,
+    },
+    "HuggingFaceTB/SmolLM2-135M-Instruct": {
+        "model": LlamaForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "HuggingFaceTB/SmolLM2-360M-Instruct": {
+        "model": LlamaForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "HuggingFaceTB/SmolLM2-1.7B-Instruct": {
+        "model": LlamaForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "ibm-granite/granite-3.0-2b-instruct": {
+        "model": GraniteForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "Qwen/Qwen2.5-0.5B-Instruct": {
+        "model": Qwen2ForCausalLM,
+        "tokenizer": Qwen2TokenizerFast,
+    },
+    "Qwen/Qwen2.5-1.5B-Instruct": {
+        "model": Qwen2ForCausalLM,
+        "tokenizer": Qwen2TokenizerFast,
+    },
+    "Qwen/Qwen2.5-Coder-1.5B-Instruct": {
+        "model": Qwen2ForCausalLM,
+        "tokenizer": Qwen2TokenizerFast,
+    },
+    "stabilityai/stablelm-2-zephyr-1_6b": {
+        "model": StableLmForCausalLM,
+        "tokenizer": GPT2TokenizerFast,
+    },
+    "THUDM/glm-edge-1.5b-chat": {
+        "model": GlmForCausalLM,
+        "tokenizer": PreTrainedTokenizerFast,
+    },
+}
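The new `CONFIG` dict pins an explicit model class and tokenizer class to each Hub repo ID instead of relying on auto-classes. A hedged sketch of using one entry directly, outside the Space — the actual app goes through `lib/loader.py` below, and the repo ID here is just an example:

```python
# Sketch: resolve one CONFIG entry to concrete transformers classes.
from lib.config import CONFIG

repo_id = "Qwen/Qwen2.5-0.5B-Instruct"
entry = CONFIG[repo_id]

# Each entry holds classes, not instances, so nothing is downloaded until needed.
tokenizer = entry["tokenizer"].from_pretrained(repo_id)
model = entry["model"].from_pretrained(repo_id)
model.eval()
```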
lib/generate.py CHANGED

@@ -1,8 +1,6 @@
 from threading import Thread
 from typing import Iterator
 
-import torch
-from gradio import Error, Progress
 from spaces import GPU, config
 from transformers import TextIteratorStreamer
 
@@ -13,23 +11,19 @@ from .loader import get_loader
 def generate(
     message: str,
     chat_history: list[dict[str, str]],
-
-    model="
+    system_message="",
+    model="Qwen/Qwen2.5-0.5B-Instruct",
     max_tokens=512,
     temperature=0.6,
     repetition_penalty=1.2,
     top_p=0.9,
     top_k=50,
-    _=Progress(track_tqdm=True),
 ) -> Iterator[str]:
-    if not torch.cuda.is_available():
-        raise Error("CUDA not available")
-
     # Prepend system prompt
     if not chat_history or chat_history[0].get("role") != "system":
-        chat_history.insert(0, {"role": "system", "content":
+        chat_history.insert(0, {"role": "system", "content": system_message})
     else:
-        chat_history[0]["content"] =
+        chat_history[0]["content"] = system_message
 
     # Append user message before generating
     chat_history.append({"role": "user", "content": message})
@@ -83,6 +77,7 @@ def transformers_generate(
         skip_special_tokens=True,
     )
 
+    # https://huggingface.co/blog/how-to-generate
     generate_kwargs = dict(
         do_sample=True,
         streamer=streamer,
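`generate` streams partial text through a `TextIteratorStreamer` while `model.generate` runs in a background thread — the pattern behind the `streamer=streamer` kwarg and the how-to-generate link added above. A self-contained sketch of that pattern, resolving the model with auto-classes rather than this repo's `CONFIG`, purely for illustration:

```python
# Sketch of the thread + TextIteratorStreamer streaming pattern; not the Space's code.
from threading import Thread

from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

repo_id = "Qwen/Qwen2.5-0.5B-Instruct"  # illustrative choice
tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)

# skip_prompt drops the echoed input; decoding skips special tokens
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(
    inputs=input_ids,
    streamer=streamer,
    do_sample=True,
    max_new_tokens=512,
    temperature=0.6,
    repetition_penalty=1.2,
    top_p=0.9,
    top_k=50,
)

# generate() blocks, so it runs in a thread while the main loop drains the streamer
Thread(target=model.generate, kwargs=generate_kwargs).start()

for chunk in streamer:
    print(chunk, end="", flush=True)
```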
lib/loader.py CHANGED

@@ -1,15 +1,8 @@
 import os
 
 import torch
-
-
-    GPT2Tokenizer,
-    LlamaForCausalLM,
-    LlamaTokenizer,
-    PreTrainedTokenizerFast,
-    Qwen2ForCausalLM,
-    Qwen2Tokenizer,
-)
+
+from .config import CONFIG
 
 
 class Loader:
@@ -31,29 +24,10 @@ class Loader:
             "low_cpu_mem_usage": True,
             "torch_dtype": torch.bfloat16 if cuda_capability >= 8 else torch.float16,
         }
-        model_fns = {
-            # Could have used auto-classes or a pipeline
-            "01-ai/Yi-Coder-1.5B-Chat": LlamaForCausalLM.from_pretrained,
-            "HuggingFaceTB/SmolLM2-135M-Instruct": LlamaForCausalLM.from_pretrained,
-            "HuggingFaceTB/SmolLM2-360M-Instruct": LlamaForCausalLM.from_pretrained,
-            "HuggingFaceTB/SmolLM2-1.7B-Instruct": LlamaForCausalLM.from_pretrained,
-            "Qwen/Qwen2.5-0.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
-            "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
-            "THUDM/glm-edge-1.5b-chat": GlmForCausalLM.from_pretrained,
-        }
-        model_tokenizers = {
-            "01-ai/Yi-Coder-1.5B-Chat": LlamaTokenizer,
-            "HuggingFaceTB/SmolLM2-135M-Instruct": GPT2Tokenizer,
-            "HuggingFaceTB/SmolLM2-360M-Instruct": GPT2Tokenizer,
-            "HuggingFaceTB/SmolLM2-1.7B-Instruct": GPT2Tokenizer,
-            "Qwen/Qwen2.5-0.5B-Instruct": Qwen2Tokenizer,
-            "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2Tokenizer,
-            "THUDM/glm-edge-1.5b-chat": PreTrainedTokenizerFast,
-        }
 
-
-        self.tokenizer =
-        self.llm =
+        config = CONFIG[model]
+        self.tokenizer = config["tokenizer"].from_pretrained(model)
+        self.llm = config["model"].from_pretrained(model, **kwargs)
         self.llm.eval()
         self.model = model
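`lib/generate.py` imports a `get_loader` helper from this module (visible in the hunk header above), but its body is not part of this diff. The following is only an assumption of how such a helper might cache a single `Loader` and reload weights only when the selected model changes; the `Loader(model)` constructor signature is hypothetical:

```python
# Hypothetical sketch of a get_loader() cache; not shown in this commit.
_loader = None


def get_loader(model: str):
    """Return a cached Loader, rebuilding it only when the model changes."""
    global _loader
    if _loader is None or _loader.model != model:
        _loader = Loader(model)  # assumed constructor; the diff only shows attributes
    return _loader
```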