Spaces: Running on Zero

second commit

Browse files

- .gitignore +2 -0
- .vscode/settings.json +28 -0
- README.md +78 -7
- app.py +109 -4
- lib/__init__.py +3 -0
- lib/generate.py +107 -0
- lib/loader.py +92 -0
- requirements.txt +13 -0
- ruff.toml +9 -0
.gitignore
ADDED

```text
__pycache__/
.venv/
```
.vscode/settings.json
ADDED

```json
{
  "editor.rulers": [110],

  "files.exclude": {
    "**/__pycache__": true,
    ".venv/**": true
  },
  "files.watcherExclude": {
    "**/__pycache__": true,
    ".venv/**": true
  },

  "notebook.formatOnSave.enabled": true,
  "notebook.codeActionsOnSave": {
    "notebook.source.fixAll.ruff": "explicit",
    "notebook.source.organizeImports.ruff": "explicit"
  },

  "[python]": {
    "editor.defaultFormatter": "charliermarsh.ruff",
    "editor.formatOnSave": true,
    "editor.tabSize": 4,
    "editor.codeActionsOnSave": {
      "source.fixAll.ruff": "explicit",
      "source.organizeImports.ruff": "explicit"
    }
  }
}
```
README.md
CHANGED

````markdown
---
# https://huggingface.co/docs/hub/en/spaces-config-reference
title: Text Generation
short_description: Simple app for small language model inference
emoji: ⌨️
colorFrom: blue
colorTo: yellow
sdk: gradio
sdk_version: 4.44.1
python_version: 3.11.9
app_file: app.py
fullWidth: false
pinned: false
header: default
license: apache-2.0
preload_from_hub:
  - >-
    01-ai/Yi-Coder-1.5B-Chat
    config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.model,tokenizer_config.json
  - >-
    google/gemma-2-2b-it
    config.json,generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer.json,tokenizer.model,tokenizer_config.json
  - >-
    hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4
    config.json,generation_config.json,model-00001-of-00002.safetensors,model-00002-of-00002.safetensors,model.safetensors.index.json,special_tokens_map.json,tokenizer.json,tokenizer_config.json
  - >-
    HuggingFaceTB/SmolLM2-135M-Instruct
    config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
  - >-
    HuggingFaceTB/SmolLM2-360M-Instruct
    config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
  - >-
    HuggingFaceTB/SmolLM2-1.7B-Instruct
    config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
  - >-
    meta-llama/Llama-3.2-1B-Instruct
    config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json
  - >-
    Qwen/Qwen2.5-0.5B-Instruct
    config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
  - >-
    Qwen/Qwen2.5-Coder-1.5B-Instruct
    config.json,generation_config.json,merges.txt,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json,vocab.json
  - >-
    THUDM/glm-edge-1.5b-chat
    config.json,generation_config.json,model.safetensors,special_tokens_map.json,tokenizer.json,tokenizer_config.json
---

# text

Simple app for small language model inference.

## Installation

```bash
# clone
git clone https://huggingface.co/spaces/adamelliotfields/text.git
cd text
git remote set-url origin https://adamelliotfields:[email protected]/spaces/adamelliotfields/text

# install
uv venv
uv pip install -r requirements.txt

# gradio
source .venv/bin/activate
gradio app.py
```

## Development

See [pull requests and discussions](https://huggingface.co/docs/hub/en/repositories-pull-requests-discussions).

```sh
git fetch origin refs/pr/42:pr/42
git checkout pr/42
# ...
git add .
git commit -m "Commit message"
git push origin pr/42:refs/pr/42
```

## Gated Models

If you get an `OSError` about a model not existing, run `huggingface-cli login` to create a `~/.cache/huggingface/token` (after accepting the terms for the model on the website).
````
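A quick way to confirm the saved token actually grants access before launching the app is to query the Hub API. This is a minimal sketch, not part of the commit, assuming `huggingface_hub` is available (it is pulled in by `transformers` and `gradio`):

```python
# Hypothetical pre-flight check (not in the repo): verify gated-model access.
from huggingface_hub import HfApi
from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError

api = HfApi()  # picks up ~/.cache/huggingface/token or the HF_TOKEN env var
try:
    api.model_info("google/gemma-2-2b-it")  # any gated model from the dropdown
    print("Token grants access.")
except GatedRepoError:
    print("Accept the model terms on the website, then retry.")
except RepositoryNotFoundError:
    print("Not logged in, or the repo id is wrong.")
```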
app.py
CHANGED

```python
import random

import gradio as gr
import numpy as np
import torch

from lib import generate

HEAD = """
<style>
  @media (min-width: 1536px) {
    gradio-app > .gradio-container { max-width: 1280px !important }
  }
</style>
"""

TITLE = """
<h1>Text Generation</h1>
"""

SEED = 0
PORT = 7860

if gr.NO_RELOAD:
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)

# https://github.com/gradio-app/gradio/blob/main/gradio/chat_interface.py
chat_interface = gr.ChatInterface(
    title=None,
    fn=generate,
    type="messages",  # interface type must match bot type
    description="Simple app for small language model inference.",
    chatbot=gr.Chatbot(type="messages", show_label=False, height=None, scale=1),
    textbox=gr.Textbox(placeholder="Type a message...", autofocus=True, scale=7),
    additional_inputs=[
        gr.Textbox(
            label="System Prompt",
            lines=2,
            value="You are a helpful assistant. Be concise and precise.",
        ),
        gr.Dropdown(
            label="Model",
            filterable=False,
            value="HuggingFaceTB/SmolLM2-135M-Instruct",
            choices=[
                "01-ai/Yi-Coder-1.5B-Chat",
                "google/gemma-2-2b-it",
                "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4",
                "HuggingFaceTB/SmolLM2-135M-Instruct",
                "HuggingFaceTB/SmolLM2-360M-Instruct",
                "HuggingFaceTB/SmolLM2-1.7B-Instruct",
                "meta-llama/Llama-3.2-1B-Instruct",
                "Qwen/Qwen2.5-0.5B-Instruct",
                "Qwen/Qwen2.5-Coder-1.5B-Instruct",
                "THUDM/glm-edge-1.5b-chat",
            ],
        ),
        gr.Slider(
            label="Max new tokens",
            minimum=1,
            maximum=2048,
            step=1,
            value=512,
            info="Maximum number of new tokens to generate.",
        ),
        gr.Slider(
            label="Temperature",
            minimum=0.1,
            maximum=2.0,
            step=0.1,
            value=0.6,
            info="Modulates next token probabilities.",
        ),
        gr.Slider(
            label="Repetition penalty",
            minimum=1.0,
            maximum=2.0,
            step=0.05,
            value=1.2,
            info="Penalizes repeating tokens.",
        ),
        gr.Slider(
            label="Top-p",
            minimum=0.05,
            maximum=1.0,
            step=0.05,
            value=0.9,
            info="Only tokens with cumulative probability p are considered (nucleus sampling).",
        ),
        gr.Slider(
            label="Top-k",
            minimum=1,
            maximum=100,
            step=1,
            value=50,
            info="Only k-th highest probability tokens are considered.",
        ),
    ],
)

with gr.Blocks(head=HEAD, fill_height=True) as demo:
    gr.HTML(TITLE)
    chat_interface.render()

if __name__ == "__main__":
    demo.queue(default_concurrency_limit=1).launch(
        server_name="0.0.0.0",
        server_port=PORT,
    )
```
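One detail worth calling out: `gr.ChatInterface` passes each `additional_inputs` value positionally after the message and history, so the widget order above must match the parameter order of `lib.generate`. A minimal sketch of that contract, illustrative only and not code from the commit:

```python
import gradio as gr

# The handler receives (message, history, *additional_inputs) in declaration order.
def echo(message, history, system_prompt, temperature):
    return f"[temp={temperature}] {system_prompt}: {message}"

demo = gr.ChatInterface(
    fn=echo,
    type="messages",
    additional_inputs=[
        gr.Textbox(label="System Prompt", value="You are terse."),
        gr.Slider(label="Temperature", minimum=0.1, maximum=2.0, value=0.6),
    ],
)

if __name__ == "__main__":
    demo.launch()
```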
lib/__init__.py
ADDED

```python
from .generate import generate

__all__ = ["generate"]
```
lib/generate.py
ADDED

```python
from threading import Thread
from typing import Iterator

import torch
from gradio import Error, Progress
from spaces import GPU, config
from transformers import TextIteratorStreamer

from .loader import get_loader


@GPU
def generate(
    message: str,
    chat_history: list[dict[str, str]],
    system_prompt="",
    model="HuggingFaceTB/SmolLM2-135M-Instruct",
    max_tokens=512,
    temperature=0.6,
    repetition_penalty=1.2,
    top_p=0.9,
    top_k=50,
    _=Progress(track_tqdm=True),
) -> Iterator[str]:
    if not torch.cuda.is_available():
        raise Error("CUDA not available")

    # Prepend system prompt
    if not chat_history or chat_history[0].get("role") != "system":
        chat_history.insert(0, {"role": "system", "content": system_prompt})
    else:
        chat_history[0]["content"] = system_prompt

    # Append user message before generating
    chat_history.append({"role": "user", "content": message})

    yield from transformers_generate(
        chat_history,
        model,
        max_tokens,
        temperature,
        repetition_penalty,
        top_p,
        top_k,
    )


def transformers_generate(
    chat_history: list[dict[str, str]],
    model: str,
    max_tokens: int,
    temperature: float,
    repetition_penalty: float,
    top_p: float,
    top_k: int,
) -> Iterator[str]:
    loader = get_loader(singleton=not config.Config.zero_gpu)
    loader.load(model)

    llm = loader.llm
    tokenizer = loader.tokenizer

    # Handle models that don't have a padding token
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    # https://huggingface.co/docs/transformers/main/en/internal/tokenization_utils#transformers.PreTrainedTokenizerBase.apply_chat_template
    results = tokenizer.apply_chat_template(
        chat_history,
        tokenize=True,
        return_dict=True,  # get the attention mask
        return_tensors="pt",
        # https://huggingface.co/docs/transformers/chat_templating#what-are-generation-prompts
        add_generation_prompt=True,
    )

    input_ids = results["input_ids"].to(llm.device)
    attention_mask = results["attention_mask"].to(llm.device)

    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True,
    )

    generate_kwargs = dict(
        do_sample=True,
        streamer=streamer,
        input_ids=input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.pad_token_id,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        max_new_tokens=max_tokens,
        repetition_penalty=repetition_penalty,
    )

    # Stream text off the main thread
    t = Thread(target=llm.generate, kwargs=generate_kwargs)
    t.start()

    # Collect output tokens
    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
```
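Because `generate` is a generator that re-yields the growing completion, it can also be exercised outside Gradio. A hypothetical local smoke test, not part of the commit (it needs a CUDA device because of the `torch.cuda.is_available()` guard, and `spaces.GPU` is a no-op off ZeroGPU):

```python
# Hypothetical smoke test: consume partial completions the way ChatInterface would.
from lib import generate

history: list[dict[str, str]] = []
partial = ""
for partial in generate(
    "Write a one-line docstring for a bubble sort.",
    history,
    system_prompt="You are a helpful assistant.",
    model="HuggingFaceTB/SmolLM2-135M-Instruct",
    max_tokens=64,
):
    pass  # each `partial` is the full text generated so far

print(partial)  # the final completion
```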
lib/loader.py
ADDED

```python
import os

import torch
from transformers import (
    AutoConfig,
    Gemma2ForCausalLM,
    GemmaTokenizer,
    GlmForCausalLM,
    GPT2Tokenizer,
    LlamaForCausalLM,
    LlamaTokenizer,
    PreTrainedTokenizerFast,
    Qwen2ForCausalLM,
    Qwen2Tokenizer,
)


class Loader:
    def __init__(self):
        self.model = ""
        self.llm = None
        self.tokenizer = None

    def load(self, model):
        if model != self.model:
            token = os.getenv("HF_TOKEN", None)
            cuda_capability = torch.cuda.get_device_capability()[0]

            # Set device_map and low_cpu_mem_usage to stream weights from disk to GPU with Accelerate
            # See https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_utils.py
            kwargs = {
                "token": token,
                "device_map": "auto",
                "low_cpu_mem_usage": True,
                "torch_dtype": torch.bfloat16 if cuda_capability >= 8 else torch.float16,
            }
            model_fns = {
                # Could have used auto-classes or a pipeline
                "01-ai/Yi-Coder-1.5B-Chat": LlamaForCausalLM.from_pretrained,
                "google/gemma-2-2b-it": Gemma2ForCausalLM.from_pretrained,
                "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4": LlamaForCausalLM.from_pretrained,
                "HuggingFaceTB/SmolLM2-135M-Instruct": LlamaForCausalLM.from_pretrained,
                "HuggingFaceTB/SmolLM2-360M-Instruct": LlamaForCausalLM.from_pretrained,
                "HuggingFaceTB/SmolLM2-1.7B-Instruct": LlamaForCausalLM.from_pretrained,
                "meta-llama/Llama-3.2-1B-Instruct": LlamaForCausalLM.from_pretrained,
                "Qwen/Qwen2.5-0.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
                "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2ForCausalLM.from_pretrained,
                "THUDM/glm-edge-1.5b-chat": GlmForCausalLM.from_pretrained,
            }
            model_tokenizers = {
                "01-ai/Yi-Coder-1.5B-Chat": LlamaTokenizer,
                "google/gemma-2-2b-it": GemmaTokenizer,
                "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4": PreTrainedTokenizerFast,
                "HuggingFaceTB/SmolLM2-135M-Instruct": GPT2Tokenizer,
                "HuggingFaceTB/SmolLM2-360M-Instruct": GPT2Tokenizer,
                "HuggingFaceTB/SmolLM2-1.7B-Instruct": GPT2Tokenizer,
                "meta-llama/Llama-3.2-1B-Instruct": PreTrainedTokenizerFast,
                "Qwen/Qwen2.5-0.5B-Instruct": Qwen2Tokenizer,
                "Qwen/Qwen2.5-Coder-1.5B-Instruct": Qwen2Tokenizer,
                "THUDM/glm-edge-1.5b-chat": PreTrainedTokenizerFast,
            }

            llm_fn = model_fns[model]
            self.tokenizer = model_tokenizers[model].from_pretrained(model)

            if model == "hugging-quants/Meta-Llama-3.1-8B-Instruct-BNB-NF4":
                # Remove unused settings
                config = AutoConfig.from_pretrained(model)
                for key in ["_load_in_4bit", "_load_in_8bit", "quant_method"]:
                    del config.quantization_config[key]
                self.llm = llm_fn(model, config=config, **kwargs)
            else:
                self.llm = llm_fn(model, **kwargs)

            self.llm.eval()
            self.model = model

            # Clean up
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
            torch.cuda.reset_peak_memory_stats()
            torch.cuda.synchronize()


# Get a singleton or new instance
def get_loader(singleton=False):
    if not singleton:
        return Loader()
    else:
        if not hasattr(get_loader, "_instance"):
            get_loader._instance = Loader()
        return get_loader._instance
```
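The `singleton` flag exists because ZeroGPU workers are ephemeral, while a persistent GPU Space benefits from keeping the loaded weights cached between requests. A small illustration of the two modes (illustrative only, not part of the commit):

```python
# Illustrative only: how get_loader behaves in the two modes.
from lib.loader import get_loader

cached_a = get_loader(singleton=True)
cached_b = get_loader(singleton=True)
assert cached_a is cached_b  # persistent GPU: one Loader, model stays loaded

fresh = get_loader(singleton=False)
assert fresh is not cached_a  # ZeroGPU: a new Loader per call
```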
requirements.txt
ADDED

```text
accelerate
bitsandbytes
gradio==4.44.1
hf-transfer
numpy==1.26.4
ruff==0.6.9
sentencepiece
setuptools
spaces==0.30.4
torch==2.4.0
torchaudio==2.4.0
torchvision==0.19.0
transformers==4.46.3
```
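Note that `hf-transfer` is opt-in: the Hub client only uses it when the `HF_HUB_ENABLE_HF_TRANSFER` environment variable is set. A one-line sketch for local runs, offered as an assumption rather than something this commit configures:

```python
# Assumption: enable hf-transfer for faster model downloads.
# Set this before importing transformers/huggingface_hub so it is picked up.
import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
```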
ruff.toml
ADDED

```toml
extend-include = ["*.ipynb"]

line-length = 110

[lint]
ignore = ["F401"]

[lint.per-file-ignores]
"*.ipynb" = ["E402"]
```