adamelliotfields committed
Commit cff83c8 · verified · 1 Parent(s): 5d43e33

second commit
Files changed (9)
  1. .gitignore +2 -0
  2. .vscode/settings.json +28 -0
  3. README.md +78 -7
  4. app.py +109 -4
  5. lib/__init__.py +3 -0
  6. lib/generate.py +107 -0
  7. lib/loader.py +92 -0
  8. requirements.txt +13 -0
  9. ruff.toml +9 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+ __pycache__/
+ .venv/
.vscode/settings.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "editor.rulers": [110],
+
+   "files.exclude": {
+     "**/__pycache__": true,
+     ".venv/**": true
+   },
+   "files.watcherExclude": {
+     "**/__pycache__": true,
+     ".venv/**": true
+   },
+
+   "notebook.formatOnSave.enabled": true,
+   "notebook.codeActionsOnSave": {
+     "notebook.source.fixAll.ruff": "explicit",
+     "notebook.source.organizeImports.ruff": "explicit"
+   },
+
+   "[python]": {
+     "editor.defaultFormatter": "charliermarsh.ruff",
+     "editor.formatOnSave": true,
+     "editor.tabSize": 4,
+     "editor.codeActionsOnSave": {
+       "source.fixAll.ruff": "explicit",
+       "source.organizeImports.ruff": "explicit"
+     }
+   }
+ }
README.md CHANGED
@@ -1,14 +1,85 @@
  ---
- title: Text
- emoji: 🌍
- colorFrom: yellow
- colorTo: red
+ # https://huggingface.co/docs/hub/en/spaces-config-reference
+ title: Text Generation
+ short_description: Simple app for small language model inference
+ emoji: ⌨️
+ colorFrom: blue
+ colorTo: yellow
  sdk: gradio
- sdk_version: 5.8.0
+ sdk_version: 4.44.1
+ python_version: 3.11.9
  app_file: app.py
+ fullWidth: false
  pinned: false
+ header: default
  license: apache-2.0
- short_description: Simple app for small language model inference
+ preload_from_hub:
+ - >-
+   01-ai/Yi-Coder-1.5B-Chat
+   config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.model,tokenizer_config.json
+ - >-
+   google/gemma-2-2b-it
+   config.json,generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer.json,tokenizer.model,tokenizer_config.json
+ - >-
+   hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4
+   config.json,generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer.json,tokenizer_config.json
+ - >-
+   HuggingFaceTB/SmolLM2-135M-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   HuggingFaceTB/SmolLM2-360M-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   HuggingFaceTB/SmolLM2-1.7B-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   meta-llama/Llama-3.2-1B-Instruct
+   config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json
+ - >-
+   Qwen/Qwen2.5-0.5B-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   Qwen/Qwen2.5-Coder-1.5B-Instruct
+   config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
+ - >-
+   THUDM/glm-edge-1.5b-chat
+   config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # text
+
+ Simple app for small language model inference.
+
+ ## Installation
+
+ ```bash
+ # clone
+ git clone https://huggingface.co/spaces/adamelliotfields/text.git
+ cd text
+ git remote set-url origin https://adamelliotfields:[email protected]/spaces/adamelliotfields/text
+
+ # install
+ uv venv
+ uv pip install -r requirements.txt
+
+ # gradio
+ source .venv/bin/activate
+ gradio app.py
+ ```
+
+ ## Development
+
+ See [pull requests and discussions](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions).
+
+ ```sh
+ git fetch origin refs/pr/42:pr/42
+ git checkout pr/42
+ # ...
+ git add .
+ git commit -m "Commit message"
+ git push origin pr/42:refs/pr/42
+ ```
+
+ ## Gated Models
+
+ If you get an `OSError` about a model not existing, run `huggingface-cli login` to create a `~/.cache/huggingface/token` (after accepting the terms for the model on the website).
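> Note (not part of the commit): the token that `huggingface-cli login` writes can also be persisted programmatically. A minimal sketch, assuming a valid token is available in the `HF_TOKEN` environment variable:

```python
# Illustrative alternative to `huggingface-cli login`; HF_TOKEN is assumed to hold a valid token.
import os

from huggingface_hub import login

login(token=os.environ["HF_TOKEN"])  # saves the token so gated model downloads can authenticate
```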
app.py CHANGED
@@ -1,7 +1,112 @@
+ import random
+
  import gradio as gr
+ import numpy as np
+ import torch
+
+ from lib import generate
+
+ HEAD = """
+ <style>
+     @media (min-width: 1536px) {
+         gradio-app > .gradio-container { max-width: 1280px !important }
+     }
+ </style>
+ """
+
+ TITLE = """
+ <h1>Text Generation</h1>
+ """
+
+ SEED = 0
+ PORT = 7860
+
+ if gr.NO_RELOAD:
+     random.seed(SEED)
+     np.random.seed(SEED)
+     torch.manual_seed(SEED)
+
+ # https://github.com/gradio-app/gradio/blob/main/gradio/chat_interface.py
+ chat_interface = gr.ChatInterface(
+     title=None,
+     fn=generate,
+     type="messages",  # interface type must match bot type
+     description="Simple app for small language model inference.",
+     chatbot=gr.Chatbot(type="messages", show_label=False, height=None, scale=1),
+     textbox=gr.Textbox(placeholder="Type a message...", autofocus=True, scale=7),
+     additional_inputs=[
+         gr.Textbox(
+             label="System Prompt",
+             lines=2,
+             value="You are a helpful assistant. Be concise and precise.",
+         ),
+         gr.Dropdown(
+             label="Model",
+             filterable=False,
+             value="HuggingFaceTB/SmolLM2-135M-Instruct",
+             choices=[
+                 "01-ai/Yi-Coder-1.5B-Chat",
+                 "google/gemma-2-2b-it",
+                 "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4",
+                 "HuggingFaceTB/SmolLM2-135M-Instruct",
+                 "HuggingFaceTB/SmolLM2-360M-Instruct",
+                 "HuggingFaceTB/SmolLM2-1.7B-Instruct",
+                 "meta-llama/Llama-3.2-1B-Instruct",
+                 "Qwen/Qwen2.5-0.5B-Instruct",
+                 "Qwen/Qwen2.5-Coder-1.5B-Instruct",
+                 "THUDM/glm-edge-1.5b-chat",
+             ],
+         ),
+         gr.Slider(
+             label="Max new tokens",
+             minimum=1,
+             maximum=2048,
+             step=1,
+             value=512,
+             info="Maximum number of new tokens to generate.",
+         ),
+         gr.Slider(
+             label="Temperature",
+             minimum=0.1,
+             maximum=2.0,
+             step=0.1,
+             value=0.6,
+             info="Modulates next token probabilities.",
+         ),
+         gr.Slider(
+             label="Repetition penalty",
+             minimum=1.0,
+             maximum=2.0,
+             step=0.05,
+             value=1.2,
+             info="Penalizes repeating tokens.",
+         ),
+         gr.Slider(
+             label="Top-p",
+             minimum=0.05,
+             maximum=1.0,
+             step=0.05,
+             value=0.9,
+             info="Only tokens with cumulative probability p are considered (nucleus sampling).",
+         ),
+         gr.Slider(
+             label="Top-k",
+             minimum=1,
+             maximum=100,
+             step=1,
+             value=50,
+             info="Only k-th highest probability tokens are considered.",
+         ),
+     ],
+ )
+

- def greet(name):
-     return "Hello " + name + "!!"
+ with gr.Blocks(head=HEAD, fill_height=True) as demo:
+     gr.HTML(TITLE)
+     chat_interface.render()

- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
- demo.launch()
+ if __name__ == "__main__":
+     demo.queue(default_concurrency_limit=1).launch(
+         server_name="0.0.0.0",
+         server_port=PORT,
+     )
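> Note (not part of the commit): `gr.ChatInterface` calls `fn` with the user message, the chat history, and then each `additional_inputs` value in the order declared above, which is why `lib.generate.generate` accepts the system prompt, model, and sampling settings positionally. A hypothetical stub illustrating that calling convention:

```python
# Hypothetical stand-in for lib.generate.generate, showing the argument order
# gr.ChatInterface uses: (message, history, *additional_inputs).
def fake_generate(message, history, system_prompt, model, max_tokens,
                  temperature, repetition_penalty, top_p, top_k):
    # A real implementation would run the model; this just echoes the inputs.
    yield f"[{model} | temp={temperature}] {system_prompt} {message}"
```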
lib/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .generate import generate
+
+ __all__ = ["generate"]
lib/generate.py ADDED
@@ -0,0 +1,107 @@
+ from threading import Thread
+ from typing import Iterator
+
+ import torch
+ from gradio import Error, Progress
+ from spaces import GPU, config
+ from transformers import TextIteratorStreamer
+
+ from .loader import get_loader
+
+
+ @GPU
+ def generate(
+     message: str,
+     chat_history: list[dict[str, str]],
+     system_prompt="",
+     model="HuggingFaceTB/SmolLM2-135M-Instruct",
+     max_tokens=512,
+     temperature=0.6,
+     repetition_penalty=1.2,
+     top_p=0.9,
+     top_k=50,
+     _=Progress(track_tqdm=True),
+ ) -> Iterator[str]:
+     if not torch.cuda.is_available():
+         raise Error("CUDA not available")
+
+     # Prepend system prompt
+     if not chat_history or chat_history[0].get("role") != "system":
+         chat_history.insert(0, {"role": "system", "content": system_prompt})
+     else:
+         chat_history[0]["content"] = system_prompt
+
+     # Append user message before generating
+     chat_history.append({"role": "user", "content": message})
+
+     yield from transformers_generate(
+         chat_history,
+         model,
+         max_tokens,
+         temperature,
+         repetition_penalty,
+         top_p,
+         top_k,
+     )
+
+
+ def transformers_generate(
+     chat_history: list[dict[str, str]],
+     model: str,
+     max_tokens: int,
+     temperature: float,
+     repetition_penalty: float,
+     top_p: float,
+     top_k: int,
+ ) -> Iterator[str]:
+     loader = get_loader(singleton=not config.Config.zero_gpu)
+     loader.load(model)
+
+     llm = loader.llm
+     tokenizer = loader.tokenizer
+
+     # Handle models that don't have a padding token
+     if tokenizer.pad_token_id is None:
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     # https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template
+     results = tokenizer.apply_chat_template(
+         chat_history,
+         tokenize=True,
+         return_dict=True,  # get the attention mask
+         return_tensors="pt",
+         # https://huggingface.co/docs/transformers/chat_templating#what-are-generation-prompts
+         add_generation_prompt=True,
+     )
+
+     input_ids = results["input_ids"].to(llm.device)
+     attention_mask = results["attention_mask"].to(llm.device)
+
+     streamer = TextIteratorStreamer(
+         tokenizer,
+         skip_prompt=True,
+         skip_special_tokens=True,
+     )
+
+     generate_kwargs = dict(
+         do_sample=True,
+         streamer=streamer,
+         input_ids=input_ids,
+         attention_mask=attention_mask,
+         pad_token_id=tokenizer.pad_token_id,
+         top_p=top_p,
+         top_k=top_k,
+         temperature=temperature,
+         max_new_tokens=max_tokens,
+         repetition_penalty=repetition_penalty,
+     )
+
+     # Stream text off the main thread
+     t = Thread(target=llm.generate, kwargs=generate_kwargs)
+     t.start()
+
+     # Collect output tokens
+     outputs = []
+     for text in streamer:
+         outputs.append(text)
+         yield "".join(outputs)
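> Note (not part of the commit): each `yield` emits the entire response accumulated so far, which is what lets `gr.ChatInterface` re-render the message as it streams. A minimal sketch of consuming the generator outside Gradio, assuming a CUDA GPU is available (otherwise `generate` raises an error):

```python
# Illustrative consumption of the streaming generator; requires a CUDA GPU.
from lib import generate

partial = ""
for partial in generate("Write a haiku about GPUs.", chat_history=[]):
    pass  # each yield is the whole response so far, not just the newest token
print(partial)
```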
lib/loader.py ADDED
@@ -0,0 +1,92 @@
+ import os
+
+ import torch
+ from transformers import (
+     AutoConfig,
+     Gemma2ForCausalLM,
+     GemmaTokenizer,
+     GlmForCausalLM,
+     GPT2Tokenizer,
+     LlamaForCausalLM,
+     LlamaTokenizer,
+     PreTrainedTokenizerFast,
+     Qwen2ForCausalLM,
+     Qwen2Tokenizer,
+ )
+
+
+ class Loader:
+     def __init__(self):
+         self.model = ""
+         self.llm = None
+         self.tokenizer = None
+
+     def load(self, model):
+         if model != self.model:
+             token = os.getenv("HF_TOKEN", None)
+             cuda_capability = torch.cuda.get_device_capability()[0]
+
+             # Set device_map and low_cpu_mem_usage to stream weights from disk to GPU with Accelerate
+             # See https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py
+             kwargs = {
+                 "token": token,
+                 "device_map": "auto",
+                 "low_cpu_mem_usage": True,
+                 "torch_dtype": torch.bfloat16 if cuda_capability >= 8 else torch.float16,
+             }
+             model_fns = {
+                 # Could have used auto-classes or a pipeline
+                 "01-ai/Yi-Coder-1.5B-Chat": LlamaForCausalLM.from_pretrained,
+                 "google/gemma-2-2b-it": Gemma2ForCausalLM.from_pretrained,
+                 "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4": LlamaForCausalLM.from_pretrained,
+                 "HuggingFaceTB/SmolLM2-135M-Instruct": LlamaForCausalLM.from_pretrained,
+                 "HuggingFaceTB/SmolLM2-360M-Instruct": LlamaForCausalLM.from_pretrained,
+                 "HuggingFaceTB/SmolLM2-1.7B-Instruct": LlamaForCausalLM.from_pretrained,
+                 "meta-llama/Llama-3.2-1B-Instruct": LlamaForCausalLM.from_pretrained,
+                 "Qwen/Qwen2.5-0.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
+                 "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
+                 "THUDM/glm-edge-1.5b-chat": GlmForCausalLM.from_pretrained,
+             }
+             model_tokenizers = {
+                 "01-ai/Yi-Coder-1.5B-Chat": LlamaTokenizer,
+                 "google/gemma-2-2b-it": GemmaTokenizer,
+                 "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4": PreTrainedTokenizerFast,
+                 "HuggingFaceTB/SmolLM2-135M-Instruct": GPT2Tokenizer,
+                 "HuggingFaceTB/SmolLM2-360M-Instruct": GPT2Tokenizer,
+                 "HuggingFaceTB/SmolLM2-1.7B-Instruct": GPT2Tokenizer,
+                 "meta-llama/Llama-3.2-1B-Instruct": PreTrainedTokenizerFast,
+                 "Qwen/Qwen2.5-0.5B-Instruct": Qwen2Tokenizer,
+                 "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2Tokenizer,
+                 "THUDM/glm-edge-1.5b-chat": PreTrainedTokenizerFast,
+             }
+
+             llm_fn = model_fns[model]
+             self.tokenizer = model_tokenizers[model].from_pretrained(model)
+
+             if model == "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4":
+                 # Remove unused settings
+                 config = AutoConfig.from_pretrained(model)
+                 for key in ["_load_in_4bit", "_load_in_8bit", "quant_method"]:
+                     del config.quantization_config[key]
+                 self.llm = llm_fn(model, config=config, **kwargs)
+             else:
+                 self.llm = llm_fn(model, **kwargs)
+
+             self.llm.eval()
+             self.model = model
+
+             # Clean up
+             torch.cuda.empty_cache()
+             torch.cuda.ipc_collect()
+             torch.cuda.reset_peak_memory_stats()
+             torch.cuda.synchronize()
+
+
+ # Get a singleton or new instance
+ def get_loader(singleton=False):
+     if not singleton:
+         return Loader()
+     else:
+         if not hasattr(get_loader, "_instance"):
+             get_loader._instance = Loader()
+         return get_loader._instance
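> Note (not part of the commit): `get_loader(singleton=True)` memoizes a single `Loader` on the function object, so a model that is already loaded gets reused across requests; the default returns a fresh instance each call (the behavior `lib.generate` selects on ZeroGPU, where `config.Config.zero_gpu` is true). A small sketch of that behavior:

```python
# Illustrative: singleton=True reuses one Loader (and its loaded weights) across calls.
from lib.loader import get_loader

a = get_loader(singleton=True)
b = get_loader(singleton=True)
assert a is b                  # same instance, so Loader.load() can skip reloading the same model
assert get_loader() is not a   # the default returns a fresh Loader each time
```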
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ accelerate
+ bitsandbytes
+ gradio==4.44.1
+ hf-transfer
+ numpy==1.26.4
+ ruff==0.6.9
+ sentencepiece
+ setuptools
+ spaces==0.30.4
+ torch==2.4.0
+ torchaudio==2.4.0
+ torchvision==0.19.0
+ transformers==4.46.3
ruff.toml ADDED
@@ -0,0 +1,9 @@
+ extend-include = ["*.ipynb"]
+
+ line-length = 110
+
+ [lint]
+ ignore = ["F401"]
+
+ [lint.per-file-ignores]
+ "*.ipynb" = ["E402"]