John6666 committed on
Commit
6200ce9
·
verified ·
1 Parent(s): 4bf5f6e

Upload 2 files

Browse files

- Fix abort error on Inference
- Fix abort error when HF_TOKEN is missing

Files changed (2) hide show
  1. app.py +182 -182
  2. requirements.txt +5 -5
app.py CHANGED
@@ -1,182 +1,182 @@
1
- import spaces
2
- import subprocess
3
- from llama_cpp import Llama
4
- from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
5
- from llama_cpp_agent.providers import LlamaCppPythonProvider
6
- from llama_cpp_agent.chat_history import BasicChatHistory
7
- from llama_cpp_agent.chat_history.messages import Roles
8
- import gradio as gr
9
- from huggingface_hub import hf_hub_download
10
- import os
11
- import cv2
12
-
13
- huggingface_token = os.environ['HF_TOKEN']
14
-
15
- # Download the Meta-Llama-3.1-8B-Instruct model
16
- hf_hub_download(
17
- repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
18
- filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
19
- local_dir="./models",
20
- token=huggingface_token
21
- )
22
-
23
- hf_hub_download(
24
- repo_id="bartowski/Mistral-Nemo-Instruct-2407-GGUF",
25
- filename="Mistral-Nemo-Instruct-2407-Q5_K_M.gguf",
26
- local_dir="./models",
27
- token=huggingface_token
28
- )
29
-
30
- hf_hub_download(
31
- repo_id="bartowski/gemma-2-2b-it-GGUF",
32
- filename="gemma-2-2b-it-Q6_K_L.gguf",
33
- local_dir="./models",
34
- token=huggingface_token
35
- )
36
-
37
- hf_hub_download(
38
- repo_id="bartowski/openchat-3.6-8b-20240522-GGUF",
39
- filename="openchat-3.6-8b-20240522-Q6_K.gguf",
40
- local_dir="./models",
41
- token=huggingface_token
42
- )
43
-
44
- hf_hub_download(
45
- repo_id="bartowski/Llama-3-Groq-8B-Tool-Use-GGUF",
46
- filename="Llama-3-Groq-8B-Tool-Use-Q6_K.gguf",
47
- local_dir="./models",
48
- token=huggingface_token
49
- )
50
-
51
-
52
- llm = None
53
- llm_model = None
54
-
55
- cv2.setNumThreads(1)
56
-
57
- @spaces.GPU()
58
- def respond(
59
- message,
60
- history: list[tuple[str, str]],
61
- model,
62
- system_message,
63
- max_tokens,
64
- temperature,
65
- top_p,
66
- top_k,
67
- repeat_penalty,
68
- ):
69
- chat_template = MessagesFormatterType.GEMMA_2
70
-
71
- global llm
72
- global llm_model
73
-
74
- # Load model only if it's not already loaded or if a new model is selected
75
- if llm is None or llm_model != model:
76
- try:
77
- llm = Llama(
78
- model_path=f"models/{model}",
79
- flash_attn=True,
80
- n_gpu_layers=81, # Adjust based on available GPU resources
81
- n_batch=1024,
82
- n_ctx=8192,
83
- )
84
- llm_model = model
85
- except Exception as e:
86
- return f"Error loading model: {str(e)}"
87
-
88
- provider = LlamaCppPythonProvider(llm)
89
-
90
- agent = LlamaCppAgent(
91
- provider,
92
- system_prompt=f"{system_message}",
93
- predefined_messages_formatter_type=chat_template,
94
- debug_output=True
95
- )
96
-
97
- settings = provider.get_provider_default_settings()
98
- settings.temperature = temperature
99
- settings.top_k = top_k
100
- settings.top_p = top_p
101
- settings.max_tokens = max_tokens
102
- settings.repeat_penalty = repeat_penalty
103
- settings.stream = True
104
-
105
- messages = BasicChatHistory()
106
-
107
- # Add user and assistant messages to the history
108
- for msn in history:
109
- user = {'role': Roles.user, 'content': msn[0]}
110
- assistant = {'role': Roles.assistant, 'content': msn[1]}
111
- messages.add_message(user)
112
- messages.add_message(assistant)
113
-
114
- # Stream the response
115
- try:
116
- stream = agent.get_chat_response(
117
- message,
118
- llm_sampling_settings=settings,
119
- chat_history=messages,
120
- returns_streaming_generator=True,
121
- print_output=False
122
- )
123
-
124
- outputs = ""
125
- for output in stream:
126
- outputs += output
127
- yield outputs
128
- except Exception as e:
129
- yield f"Error during response generation: {str(e)}"
130
-
131
- demo = gr.ChatInterface(
132
- respond,
133
- additional_inputs=[
134
- gr.Dropdown([
135
- 'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
136
- 'Mistral-Nemo-Instruct-2407-Q5_K_M.gguf',
137
- 'gemma-2-2b-it-Q6_K_L.gguf',
138
- 'openchat-3.6-8b-20240522-Q6_K.gguf',
139
- 'Llama-3-Groq-8B-Tool-Use-Q6_K.gguf'
140
- ],
141
- value="gemma-2-2b-it-Q6_K_L.gguf",
142
- label="Model"
143
- ),
144
- gr.Textbox(value="You are a helpful assistant.", label="System message"),
145
- gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
146
- gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
147
- gr.Slider(
148
- minimum=0.1,
149
- maximum=1.0,
150
- value=0.95,
151
- step=0.05,
152
- label="Top-p",
153
- ),
154
- gr.Slider(
155
- minimum=0,
156
- maximum=100,
157
- value=40,
158
- step=1,
159
- label="Top-k",
160
- ),
161
- gr.Slider(
162
- minimum=0.0,
163
- maximum=2.0,
164
- value=1.1,
165
- step=0.1,
166
- label="Repetition penalty",
167
- ),
168
- ],
169
- retry_btn="Retry",
170
- undo_btn="Undo",
171
- clear_btn="Clear",
172
- submit_btn="Send",
173
- title="Chat with lots of Models and LLMs using llama.cpp",
174
- chatbot=gr.Chatbot(
175
- scale=1,
176
- likeable=False,
177
- show_copy_button=True
178
- )
179
- )
180
-
181
- if __name__ == "__main__":
182
- demo.launch()
 
1
+ import spaces
2
+ import subprocess
3
+ from llama_cpp import Llama
4
+ from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
5
+ from llama_cpp_agent.providers import LlamaCppPythonProvider
6
+ from llama_cpp_agent.chat_history import BasicChatHistory
7
+ from llama_cpp_agent.chat_history.messages import Roles
8
+ import gradio as gr
9
+ from huggingface_hub import hf_hub_download
10
+ import os
11
+ import cv2
12
+
13
# Hugging Face access token; None when the HF_TOKEN variable is not set.
huggingface_token = os.environ.get('HF_TOKEN')

# (repo_id, filename) of every GGUF model fetched at startup, in download order.
_MODEL_FILES = (
    ("bartowski/Meta-Llama-3.1-8B-Instruct-GGUF", "Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf"),
    ("bartowski/Mistral-Nemo-Instruct-2407-GGUF", "Mistral-Nemo-Instruct-2407-Q5_K_M.gguf"),
    ("bartowski/gemma-2-2b-it-GGUF", "gemma-2-2b-it-Q6_K_L.gguf"),
    ("bartowski/openchat-3.6-8b-20240522-GGUF", "openchat-3.6-8b-20240522-Q6_K.gguf"),
    ("bartowski/Llama-3-Groq-8B-Tool-Use-GGUF", "Llama-3-Groq-8B-Tool-Use-Q6_K.gguf"),
)

# Fetch each model file into ./models.
for _repo_id, _filename in _MODEL_FILES:
    hf_hub_download(
        repo_id=_repo_id,
        filename=_filename,
        local_dir="./models",
        token=huggingface_token,
    )


# Set by respond(): the loaded Llama instance and the model file name it was
# loaded from, so repeated requests for the same model reuse it.
llm = None
llm_model = None

cv2.setNumThreads(1)
56
+
57
@spaces.GPU()
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Stream a chat reply for ``message`` from the selected GGUF model.

    The model is loaded from ``models/{model}`` on first use, or whenever a
    different ``model`` is requested, and is kept in the module-level ``llm`` /
    ``llm_model`` globals for later calls.

    Args:
        message: The new user message.
        history: Earlier turns as ``(user, assistant)`` pairs.
        model: File name of the model inside the ``models`` directory.
        system_message: System prompt handed to the agent.
        max_tokens: Copied to the sampling settings' ``max_tokens``.
        temperature: Copied to the sampling settings' ``temperature``.
        top_p: Copied to the sampling settings' ``top_p``.
        top_k: Copied to the sampling settings' ``top_k``.
        repeat_penalty: Copied to the sampling settings' ``repeat_penalty``.

    Yields:
        The reply accumulated so far, once per streamed chunk. If loading the
        model or generating the reply fails, a single error string is yielded
        instead.
    """
    global llm
    global llm_model

    # NOTE(review): the Gemma 2 template is applied to every model in the
    # dropdown, not only the Gemma one -- confirm whether this is intended.
    chat_template = MessagesFormatterType.GEMMA_2

    # Load model only if it's not already loaded or if a new model is selected
    if llm is None or llm_model != model:
        try:
            llm = Llama(
                model_path=f"models/{model}",
                flash_attn=True,
                n_gpu_layers=81,  # Adjust based on available GPU resources
                n_batch=1024,
                n_ctx=8192,
            )
            llm_model = model
        except Exception as e:
            # This function is a generator, so the message has to be yielded:
            # a plain `return <value>` would end the stream without ever
            # delivering the text to the caller.
            yield f"Error loading model: {str(e)}"
            return

    provider = LlamaCppPythonProvider(llm)

    agent = LlamaCppAgent(
        provider,
        system_prompt=f"{system_message}",
        predefined_messages_formatter_type=chat_template,
        debug_output=True
    )

    settings = provider.get_provider_default_settings()
    settings.temperature = temperature
    settings.top_k = top_k
    settings.top_p = top_p
    settings.max_tokens = max_tokens
    settings.repeat_penalty = repeat_penalty
    settings.stream = True

    messages = BasicChatHistory()

    # Replay earlier turns into the agent's chat history, user message first.
    for turn in history:
        messages.add_message({'role': Roles.user, 'content': turn[0]})
        messages.add_message({'role': Roles.assistant, 'content': turn[1]})

    # Stream the response
    try:
        stream = agent.get_chat_response(
            message,
            llm_sampling_settings=settings,
            chat_history=messages,
            returns_streaming_generator=True,
            print_output=False
        )

        outputs = ""
        for output in stream:
            outputs += output
            # Yield the whole text so far, not just the newest chunk.
            yield outputs
    except Exception as e:
        yield f"Error during response generation: {str(e)}"
130
+
131
# Extra controls shown with the chat box. Their order must match the
# parameters of respond() that follow `message` and `history`.
_model_dropdown = gr.Dropdown(
    [
        'Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf',
        'Mistral-Nemo-Instruct-2407-Q5_K_M.gguf',
        'gemma-2-2b-it-Q6_K_L.gguf',
        'openchat-3.6-8b-20240522-Q6_K.gguf',
        'Llama-3-Groq-8B-Tool-Use-Q6_K.gguf'
    ],
    value="gemma-2-2b-it-Q6_K_L.gguf",
    label="Model"
)
_system_box = gr.Textbox(value="You are a helpful assistant.", label="System message")
_max_tokens_slider = gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens")
_temperature_slider = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
_top_p_slider = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p")
_top_k_slider = gr.Slider(minimum=0, maximum=100, value=40, step=1, label="Top-k")
_repeat_penalty_slider = gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Repetition penalty")

# Chat UI wired to the streaming respond() generator.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        _model_dropdown,
        _system_box,
        _max_tokens_slider,
        _temperature_slider,
        _top_p_slider,
        _top_k_slider,
        _repeat_penalty_slider,
    ],
    retry_btn="Retry",
    undo_btn="Undo",
    clear_btn="Clear",
    submit_btn="Send",
    title="Chat with lots of Models and LLMs using llama.cpp",
    chatbot=gr.Chatbot(scale=1, likeable=False, show_copy_button=True)
)

# Start the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
- gradio
2
- spaces
3
- llama-cpp-python
4
- llama-cpp-agent
5
- huggingface_hub
6
  opencv-python
 
1
+ spaces
2
+ huggingface_hub
3
+ scikit-build-core
4
+ https://github.com/abetlen/llama-cpp-python/releases/download/v0.2.90-cu124/llama_cpp_python-0.2.90-cp310-cp310-linux_x86_64.whl
5
+ git+https://github.com/Maximilian-Winter/llama-cpp-agent
6
  opencv-python