adamelliotfields committed
Commit cff83c8 · verified · 1 Parent(s): 5d43e33

second commit
Files changed (9)
  1. .gitignore +2 -0
  2. .vscode/settings.json +28 -0
  3. README.md +78 -7
  4. app.py +109 -4
  5. lib/__init__.py +3 -0
  6. lib/generate.py +107 -0
  7. lib/loader.py +92 -0
  8. requirements.txt +13 -0
  9. ruff.toml +9 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ .venv/
.vscode/settings.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "editor.rulers": [110],
+
+   "files.exclude": {
+     "**/__pycache__": true,
+     ".venv/**": true
+   },
+   "files.watcherExclude": {
+     "**/__pycache__": true,
+     ".venv/**": true
+   },
+
+   "notebook.formatOnSave.enabled": true,
+   "notebook.codeActionsOnSave": {
+     "notebook.source.fixAll.ruff": "explicit",
+     "notebook.source.organizeImports.ruff": "explicit"
+   },
+
+   "[python]": {
+     "editor.defaultFormatter": "charliermarsh.ruff",
+     "editor.formatOnSave": true,
+     "editor.tabSize": 4,
+     "editor.codeActionsOnSave": {
+       "source.fixAll.ruff": "explicit",
+       "source.organizeImports.ruff": "explicit"
+     }
+   }
+ }
README.md CHANGED
@@ -1,14 +1,85 @@
  ---
- title: Text
- emoji: 🌍
- colorFrom: yellow
- colorTo: red
+ # https://huggingface.co/docs/hub/en/spaces-config-reference
+ title: Text Generation
+ short_description: Simple app for small language model inference
+ emoji: ⌨️
+ colorFrom: blue
+ colorTo: yellow
  sdk: gradio
- sdk_version: 5.8.0
+ sdk_version: 4.44.1
+ python_version: 3.11.9
  app_file: app.py
+ fullWidth: false
  pinned: false
+ header: default
  license: apache-2.0
- short_description: Simple app for small language model inference
+ preload_from_hub:
+ - >-
+   01-ai/Yi-Coder-1.5B-Chat
+   config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.model,tokenizer_config.json
+ - >-
+   google/gemma-2-2b-it
+   config.json,generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer.json,tokenizer.model,tokenizer_config.json
+ - >-
+   hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4
+   config.json,generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer.json,tokenizer_config.json
+ - >-
+   HuggingFaceTB/SmolLM2-135M-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   HuggingFaceTB/SmolLM2-360M-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   HuggingFaceTB/SmolLM2-1.7B-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   meta-llama/Llama-3.2-1B-Instruct
+   config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json
+ - >-
+   Qwen/Qwen2.5-0.5B-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   Qwen/Qwen2.5-Coder-1.5B-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   THUDM/glm-edge-1.5b-chat
+   config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # text
+
+ Simple app for small language model inference.
+
+ ## Installation
+
+ ```bash
+ # clone
+ git clone https://huggingface.co/spaces/adamelliotfields/text.git
+ cd text
+ git remote set-url origin https://adamelliotfields:[email protected]/spaces/adamelliotfields/text
+
+ # install
+ uv venv
+ uv pip install -r requirements.txt
+
+ # gradio
+ source .venv/bin/activate
+ gradio app.py
+ ```
+
+ ## Development
+
+ See [pull requests and discussions](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions).
+
+ ```sh
+ git fetch origin refs/pr/42:pr/42
+ git checkout pr/42
+ # ...
+ git add .
+ git commit -m "Commit message"
+ git push origin pr/42:refs/pr/42
+ ```
+
+ ## Gated Models
+
+ If you get an `OSError` about a model not existing, run `huggingface-cli login` to create a `~/.cache/huggingface/token` (after accepting the terms for the model on the website).
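> Note (not part of the commit): the token that `huggingface-cli login` writes can also be persisted programmatically. A minimal sketch, assuming a valid token is available in the `HF_TOKEN` environment variable:

```python
# Illustrative alternative to `huggingface-cli login`; HF_TOKEN is assumed to hold a valid token.
import os

from huggingface_hub import login

login(token=os.environ["HF_TOKEN"])  # saves the token so gated model downloads can authenticate
```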
app.py CHANGED
@@ -1,7 +1,112 @@
+ import random
+
  import gradio as gr
+ import numpy as np
+ import torch
+
+ from lib import generate
+
+ HEAD = """
+ <style>
+     @media (min-width: 1536px) {
+         gradio-app > .gradio-container { max-width: 1280px !important }
+     }
+ </style>
+ """
+
+ TITLE = """
+ <h1>Text Generation</h1>
+ """
+
+ SEED = 0
+ PORT = 7860
+
+ if gr.NO_RELOAD:
+     random.seed(SEED)
+     np.random.seed(SEED)
+     torch.manual_seed(SEED)
+
+ # https://github.com/gradio-app/gradio/blob/main/gradio/chat_interface.py
+ chat_interface = gr.ChatInterface(
+     title=None,
+     fn=generate,
+     type="messages",  # interface type must match bot type
+     description="Simple app for small language model inference.",
+     chatbot=gr.Chatbot(type="messages", show_label=False, height=None, scale=1),
+     textbox=gr.Textbox(placeholder="Type a message...", autofocus=True, scale=7),
+     additional_inputs=[
+         gr.Textbox(
+             label="System Prompt",
+             lines=2,
+             value="You are a helpful assistant. Be concise and precise.",
+         ),
+         gr.Dropdown(
+             label="Model",
+             filterable=False,
+             value="HuggingFaceTB/SmolLM2-135M-Instruct",
+             choices=[
+                 "01-ai/Yi-Coder-1.5B-Chat",
+                 "google/gemma-2-2b-it",
+                 "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4",
+                 "HuggingFaceTB/SmolLM2-135M-Instruct",
+                 "HuggingFaceTB/SmolLM2-360M-Instruct",
+                 "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+                 "meta-llama/Llama-3.2-1B-Instruct",
+                 "Qwen/Qwen2.5-0.5B-Instruct",
+                 "Qwen/Qwen2.5-Coder-1.5B-Instruct",
+                 "THUDM/glm-edge-1.5b-chat",
+             ],
+         ),
+         gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=2048,
+             step=1,
+             value=512,
+             info="Maximum number of new tokens to generate.",
+         ),
+         gr.Slider(
+             label="Temperature",
+             minimum=0.1,
+             maximum=2.0,
+             step=0.1,
+             value=0.6,
+             info="Modulates next token probabilities.",
+         ),
+         gr.Slider(
+             label="Repetition penalty",
+             minimum=1.0,
+             maximum=2.0,
+             step=0.05,
+             value=1.2,
+             info="Penalizes repeating tokens.",
+         ),
+         gr.Slider(
+             label="Top-p",
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=0.9,
+             info="Only tokens with cumulative probability p are considered (nucleus sampling).",
+         ),
+         gr.Slider(
+             label="Top-k",
+             minimum=1,
+             maximum=100,
+             step=1,
+             value=50,
+             info="Only k-th highest probability tokens are considered.",
+         ),
+     ],
+ )
+

- def greet(name):
-     return "Hello " + name + "!!"
+ with gr.Blocks(head=HEAD, fill_height=True) as demo:
+     gr.HTML(TITLE)
+     chat_interface.render()

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=1).launch(
+         server_name="0.0.0.0",
+         server_port=PORT,
+     )
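> Note (not part of the commit): `gr.ChatInterface` calls `fn` with the user message, the chat history, and then each `additional_inputs` value in the order declared above, which is why `lib.generate.generate` accepts the system prompt, model, and sampling settings positionally. A hypothetical stub illustrating that calling convention:

```python
# Hypothetical stand-in for lib.generate.generate, showing the argument order
# gr.ChatInterface uses: (message, history, *additional_inputs).
def fake_generate(message, history, system_prompt, model, max_tokens,
                  temperature, repetition_penalty, top_p, top_k):
    # A real implementation would run the model; this just echoes the inputs.
    yield f"[{model} | temp={temperature}] {system_prompt} {message}"
```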
lib/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .generate import generate
+
+ __all__ = ["generate"]
lib/generate.py ADDED
@@ -0,0 +1,107 @@
+ from threading import Thread
+ from typing import Iterator
+
+ import torch
+ from gradio import Error, Progress
+ from spaces import GPU, config
+ from transformers import TextIteratorStreamer
+
+ from .loader import get_loader
+
+
+ @GPU
+ def generate(
+     message: str,
+     chat_history: list[dict[str, str]],
+     system_prompt="",
+     model="HuggingFaceTB/SmolLM2-135M-Instruct",
+     max_tokens=512,
+     temperature=0.6,
+     repetition_penalty=1.2,
+     top_p=0.9,
+     top_k=50,
+     _=Progress(track_tqdm=True),
+ ) -> Iterator[str]:
+     if not torch.cuda.is_available():
+         raise Error("CUDA not available")
+
+     # Prepend system prompt
+     if not chat_history or chat_history[0].get("role") != "system":
+         chat_history.insert(0, {"role": "system", "content": system_prompt})
+     else:
+         chat_history[0]["content"] = system_prompt
+
+     # Append user message before generating
+     chat_history.append({"role": "user", "content": message})
+
+     yield from transformers_generate(
+         chat_history,
+         model,
+         max_tokens,
+         temperature,
+         repetition_penalty,
+         top_p,
+         top_k,
+     )
+
+
+ def transformers_generate(
+     chat_history: list[dict[str, str]],
+     model: str,
+     max_tokens: int,
+     temperature: float,
+     repetition_penalty: float,
+     top_p: float,
+     top_k: int,
+ ) -> Iterator[str]:
+     loader = get_loader(singleton=not config.Config.zero_gpu)
+     loader.load(model)
+
+     llm = loader.llm
+     tokenizer = loader.tokenizer
+
+     # Handle models that don't have a padding token
+     if tokenizer.pad_token_id is None:
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     # https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template
+     results = tokenizer.apply_chat_template(
+         chat_history,
+         tokenize=True,
+         return_dict=True,  # get the attention mask
+         return_tensors="pt",
+         # https://huggingface.co/docs/transformers/chat_templating#what-are-generation-prompts
+         add_generation_prompt=True,
+     )
+
+     input_ids = results["input_ids"].to(llm.device)
+     attention_mask = results["attention_mask"].to(llm.device)
+
+     streamer = TextIteratorStreamer(
+         tokenizer,
+         skip_prompt=True,
+         skip_special_tokens=True,
+     )
+
+     generate_kwargs = dict(
+         do_sample=True,
+         streamer=streamer,
+         input_ids=input_ids,
+         attention_mask=attention_mask,
+         pad_token_id=tokenizer.pad_token_id,
+         top_p=top_p,
+         top_k=top_k,
+         temperature=temperature,
+         max_new_tokens=max_tokens,
+         repetition_penalty=repetition_penalty,
+     )
+
+     # Stream text off the main thread
+     t = Thread(target=llm.generate, kwargs=generate_kwargs)
+     t.start()
+
+     # Collect output tokens
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield "".join(outputs)
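> Note (not part of the commit): each `yield` emits the entire response accumulated so far, which is what lets `gr.ChatInterface` re-render the message as it streams. A minimal sketch of consuming the generator outside Gradio, assuming a CUDA GPU is available (otherwise `generate` raises an error):

```python
# Illustrative consumption of the streaming generator; requires a CUDA GPU.
from lib import generate

partial = ""
for partial in generate("Write a haiku about GPUs.", chat_history=[]):
    pass  # each yield is the whole response so far, not just the newest token
print(partial)
```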
lib/loader.py ADDED
@@ -0,0 +1,92 @@
+ import os
+
+ import torch
+ from transformers import (
+     AutoConfig,
+     Gemma2ForCausalLM,
+     GemmaTokenizer,
+     GlmForCausalLM,
+     GPT2Tokenizer,
+     LlamaForCausalLM,
+     LlamaTokenizer,
+     PreTrainedTokenizerFast,
+     Qwen2ForCausalLM,
+     Qwen2Tokenizer,
+ )
+
+
+ class Loader:
+     def __init__(self):
+         self.model = ""
+         self.llm = None
+         self.tokenizer = None
+
+     def load(self, model):
+         if model != self.model:
+             token = os.getenv("HF_TOKEN", None)
+             cuda_capability = torch.cuda.get_device_capability()[0]
+
+             # Set device_map and low_cpu_mem_usage to stream weights from disk to GPU with Accelerate
+             # See https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py
+             kwargs = {
+                 "token": token,
+                 "device_map": "auto",
+                 "low_cpu_mem_usage": True,
+                 "torch_dtype": torch.bfloat16 if cuda_capability >= 8 else torch.float16,
+             }
+             model_fns = {
+                 # Could have used auto-classes or a pipeline
+                 "01-ai/Yi-Coder-1.5B-Chat": LlamaForCausalLM.from_pretrained,
+                 "google/gemma-2-2b-it": Gemma2ForCausalLM.from_pretrained,
+                 "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4": LlamaForCausalLM.from_pretrained,
+                 "HuggingFaceTB/SmolLM2-135M-Instruct": LlamaForCausalLM.from_pretrained,
+                 "HuggingFaceTB/SmolLM2-360M-Instruct": LlamaForCausalLM.from_pretrained,
+                 "HuggingFaceTB/SmolLM2-1.7B-Instruct": LlamaForCausalLM.from_pretrained,
+                 "meta-llama/Llama-3.2-1B-Instruct": LlamaForCausalLM.from_pretrained,
+                 "Qwen/Qwen2.5-0.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
+                 "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
+                 "THUDM/glm-edge-1.5b-chat": GlmForCausalLM.from_pretrained,
+             }
+             model_tokenizers = {
+                 "01-ai/Yi-Coder-1.5B-Chat": LlamaTokenizer,
+                 "google/gemma-2-2b-it": GemmaTokenizer,
+                 "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4": PreTrainedTokenizerFast,
+                 "HuggingFaceTB/SmolLM2-135M-Instruct": GPT2Tokenizer,
+                 "HuggingFaceTB/SmolLM2-360M-Instruct": GPT2Tokenizer,
+                 "HuggingFaceTB/SmolLM2-1.7B-Instruct": GPT2Tokenizer,
+                 "meta-llama/Llama-3.2-1B-Instruct": PreTrainedTokenizerFast,
+                 "Qwen/Qwen2.5-0.5B-Instruct": Qwen2Tokenizer,
+                 "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2Tokenizer,
+                 "THUDM/glm-edge-1.5b-chat": PreTrainedTokenizerFast,
+             }
+
+             llm_fn = model_fns[model]
+             self.tokenizer = model_tokenizers[model].from_pretrained(model)
+
+             if model == "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4":
+                 # Remove unused settings
+                 config = AutoConfig.from_pretrained(model)
+                 for key in ["_load_in_4bit", "_load_in_8bit", "quant_method"]:
+                     del config.quantization_config[key]
+                 self.llm = llm_fn(model, config=config, **kwargs)
+             else:
+                 self.llm = llm_fn(model, **kwargs)
+
+             self.llm.eval()
+             self.model = model
+
+             # Clean up
+             torch.cuda.empty_cache()
+             torch.cuda.ipc_collect()
+             torch.cuda.reset_peak_memory_stats()
+             torch.cuda.synchronize()
+
+
+ # Get a singleton or new instance
+ def get_loader(singleton=False):
+     if not singleton:
+         return Loader()
+     else:
+         if not hasattr(get_loader, "_instance"):
+             get_loader._instance = Loader()
+         return get_loader._instance
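> Note (not part of the commit): `get_loader(singleton=True)` memoizes a single `Loader` on the function object, so a model that is already loaded gets reused across requests; the default returns a fresh instance each call (the behavior `lib.generate` selects on ZeroGPU, where `config.Config.zero_gpu` is true). A small sketch of that behavior:

```python
# Illustrative: singleton=True reuses one Loader (and its loaded weights) across calls.
from lib.loader import get_loader

a = get_loader(singleton=True)
b = get_loader(singleton=True)
assert a is b                  # same instance, so Loader.load() can skip reloading the same model
assert get_loader() is not a   # the default returns a fresh Loader each time
```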
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ accelerate
+ bitsandbytes
+ gradio==4.44.1
+ hf-transfer
+ numpy==1.26.4
+ ruff==0.6.9
+ sentencepiece
+ setuptools
+ spaces==0.30.4
+ torch==2.4.0
+ torchaudio==2.4.0
+ torchvision==0.19.0
+ transformers==4.46.3
ruff.toml ADDED
@@ -0,0 +1,9 @@
+ extend-include = ["*.ipynb"]
+
+ line-length = 110
+
+ [lint]
+ ignore = ["F401"]
+
+ [lint.per-file-ignores]
+ "*.ipynb" = ["E402"]