unclemusclez committed (verified)
Commit 09bfa15 · 1 Parent(s): 9daf35d

Update app.py

Files changed (1)
  1. app.py +23 -289
app.py CHANGED
@@ -17,81 +17,15 @@ from apscheduler.schedulers.background import BackgroundScheduler
 from textwrap import dedent
 
 HF_TOKEN = os.environ.get("HF_TOKEN")
+OLLAMA_PUB = os.environ.get("OLLAMA_PUB")
+OLLAMA_USERNAME = os.environ.get("OLLAMA_USERNAME")
 
 
-def generate_importance_matrix(model_path, train_data_path):
-    imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
-
-    os.chdir("llama.cpp")
-
-    print(f"Current working directory: {os.getcwd()}")
-    print(f"Files in the current directory: {os.listdir('.')}")
-
-    if not os.path.isfile(f"../{model_path}"):
-        raise Exception(f"Model file not found: {model_path}")
-
-    print("Running imatrix command...")
-    process = subprocess.Popen(imatrix_command, shell=True)
-
-    try:
-        process.wait(timeout=60) # added wait
-    except subprocess.TimeoutExpired:
-        print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
-        process.send_signal(signal.SIGINT)
-        try:
-            process.wait(timeout=5) # grace period
-        except subprocess.TimeoutExpired:
-            print("Imatrix proc still didn't term. Forecfully terming process...")
-            process.kill()
-
-    os.chdir("..")
-
-    print("Importance matrix generation completed.")
-
-def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
-    if oauth_token.token is None:
-        raise ValueError("You have to be logged in.")
-
-    split_cmd = f"llama.cpp/llama-gguf-split --split --split-max-tensors {split_max_tensors}"
-    if split_max_size:
-        split_cmd += f" --split-max-size {split_max_size}"
-    split_cmd += f" {model_path} {model_path.split('.')[0]}"
-
-    print(f"Split command: {split_cmd}")
-
-    result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
-    print(f"Split command stdout: {result.stdout}")
-    print(f"Split command stderr: {result.stderr}")
-
-    if result.returncode != 0:
-        raise Exception(f"Error splitting the model: {result.stderr}")
-    print("Model split successfully!")
-
-
-    sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
-    if sharded_model_files:
-        print(f"Sharded model files: {sharded_model_files}")
-        api = HfApi(token=oauth_token.token)
-        for file in sharded_model_files:
-            file_path = os.path.join('.', file)
-            print(f"Uploading file: {file_path}")
-            try:
-                api.upload_file(
-                    path_or_fileobj=file_path,
-                    path_in_repo=file,
-                    repo_id=repo_id,
-                )
-            except Exception as e:
-                raise Exception(f"Error uploading file {file_path}: {e}")
-    else:
-        raise Exception("No sharded files found.")
-
-    print("Sharded model has been uploaded successfully!")
-
-def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
+def process_model(model_id, q_method, oauth_token: gr.OAuthToken | None):
     if oauth_token.token is None:
         raise ValueError("You must be logged in to use GGUF-my-repo")
     model_name = model_id.split('/')[-1]
-    fp16 = f"{model_name}.fp16.gguf"
+    ollama = f"{model_name}.fp16.gguf"
 
     try:
         api = HfApi(token=oauth_token.token)
@@ -117,156 +51,26 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
         print(f"Current working directory: {os.getcwd()}")
         print(f"Model directory contents: {os.listdir(model_name)}")
 
-        conversion_script = "convert-hf-to-gguf.py"
-        fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
-        result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
+        model_file = f"{model_name}_ollama"
+        f = open(model_file, "w")
+        print(f.write(f"From {model_id}"))
+        ollama_conversion = f"ollama create -f {model_file} {OLLAMA_USERNAME}/{model_id}:{q_method}"
+        ollama_push = f"ollama push {OLLAMA_USERNAME}/{model_id}:{q_method}"
+
+        result = subprocess.run(ollama_conversion, shell=True, capture_output=True)
         print(result)
         if result.returncode != 0:
-            raise Exception(f"Error converting to fp16: {result.stderr}")
-        print("Model converted to fp16 successfully!")
-        print(f"Converted model path: {fp16}")
-
-        imatrix_path = "llama.cpp/imatrix.dat"
-
-        if use_imatrix:
-            if train_data_file:
-                train_data_path = train_data_file.name
-            else:
-                train_data_path = "groups_merged.txt" #fallback calibration dataset
-
-            print(f"Training data file path: {train_data_path}")
-
-            if not os.path.isfile(train_data_path):
-                raise Exception(f"Training data file not found: {train_data_path}")
-
-            generate_importance_matrix(fp16, train_data_path)
-        else:
-            print("Not using imatrix quantization.")
-        username = whoami(oauth_token.token)["name"]
-        quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
-        quantized_gguf_path = quantized_gguf_name
-        if use_imatrix:
-            quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
-        else:
-            quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
-        result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
-        if result.returncode != 0:
-            raise Exception(f"Error quantizing: {result.stderr}")
-        print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-        print(f"Quantized model path: {quantized_gguf_path}")
-
-        # Create empty repo
-        new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
-        new_repo_id = new_repo_url.repo_id
-        print("Repo created successfully!", new_repo_url)
-
-        try:
-            card = ModelCard.load(model_id, token=oauth_token.token)
-        except:
-            card = ModelCard("")
-        if card.data.tags is None:
-            card.data.tags = []
-        card.data.tags.append("llama-cpp")
-        card.data.tags.append("gguf-my-repo")
-        card.data.base_model = model_id
-        card.text = dedent(
-            f"""
-            # {new_repo_id}
-            This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
-            Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
-
-            ## Use with llama.cpp
-            Install llama.cpp through brew (works on Mac and Linux)
-
-            ```bash
-            brew install llama.cpp
-
-            ```
-            Invoke the llama.cpp server or the CLI.
-
-            ### CLI:
-            ```bash
-            llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
-            ```
-
-            ### Server:
-            ```bash
-            llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
-            ```
-
-            Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
-
-            Step 1: Clone llama.cpp from GitHub.
-            ```
-            git clone https://github.com/ggerganov/llama.cpp
-            ```
-
-            Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
-            ```
-            cd llama.cpp && LLAMA_CURL=1 make
-            ```
-
-            Step 3: Run inference through the main binary.
-            ```
-            ./llama-cli --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
-            ```
-            or
-            ```
-            ./llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
-            ```
-            """
-        )
-        card.save(f"README.md")
-
-        if split_model:
-            split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
-        else:
-            try:
-                print(f"Uploading quantized model: {quantized_gguf_path}")
-                api.upload_file(
-                    path_or_fileobj=quantized_gguf_path,
-                    path_in_repo=quantized_gguf_name,
-                    repo_id=new_repo_id,
-                )
-            except Exception as e:
-                raise Exception(f"Error uploading quantized model: {e}")
-
-
-        imatrix_path = "llama.cpp/imatrix.dat"
-        if os.path.isfile(imatrix_path):
-            try:
-                print(f"Uploading imatrix.dat: {imatrix_path}")
-                api.upload_file(
-                    path_or_fileobj=imatrix_path,
-                    path_in_repo="imatrix.dat",
-                    repo_id=new_repo_id,
-                )
-            except Exception as e:
-                raise Exception(f"Error uploading imatrix.dat: {e}")
-
-        api.upload_file(
-            path_or_fileobj=f"README.md",
-            path_in_repo=f"README.md",
-            repo_id=new_repo_id,
-        )
-        print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
-
-        return (
-            f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
-            "llama.png",
-        )
-    except Exception as e:
-        return (f"Error: {e}", "error.png")
-    finally:
-        shutil.rmtree(model_name, ignore_errors=True)
-        print("Folder cleaned up successfully!")
+            raise Exception(f"Error converting to Ollama: {result.stderr}")
+        print("Model converted to Ollama successfully!")
+        print(f"Converted model path: {ollama}")
+
 
 css="""/* Custom CSS to allow scrolling */
 .gradio-container {overflow-y: auto;}
 """
 # Create Gradio interface
 with gr.Blocks(css=css) as demo:
-    gr.Markdown("You must be logged in to use GGUF-my-repo.")
+    gr.Markdown("You must be logged in to use Ollamafy.")
     gr.LoginButton(min_width=250)
 
     model_id = HuggingfaceHubSearch(
@@ -275,103 +79,33 @@ with gr.Blocks(css=css) as demo:
         search_type="model",
     )
 
-    q_method = gr.Dropdown(
-        ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
+    q_method = gr.Dropdown(
+        ## ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
+        ["Q3_k_s", "Q3_k_m", "Q3_k_l", "Q4_0", "Q4_1", "Q4_k_s", "Q4_k_m", "Q5_0", "Q5_1", "Q5_k_s", "Q5_k_m", "Q6_k","Q8_0"],
         label="Quantization Method",
-        info="GGML quantization type",
-        value="Q4_K_M",
+        info="Ollama Quantization Types",
+        value="ALL",
         filterable=False,
         visible=True
     )
 
-    imatrix_q_method = gr.Dropdown(
-        ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
-        label="Imatrix Quantization Method",
-        info="GGML imatrix quants type",
-        value="IQ4_NL",
-        filterable=False,
-        visible=False
-    )
-
-    use_imatrix = gr.Checkbox(
-        value=False,
-        label="Use Imatrix Quantization",
-        info="Use importance matrix for quantization."
-    )
-
-    private_repo = gr.Checkbox(
-        value=False,
-        label="Private Repo",
-        info="Create a private repo under your username."
-    )
-
-    train_data_file = gr.File(
-        label="Training Data File",
-        file_types=["txt"],
-        visible=False
-    )
-
-    split_model = gr.Checkbox(
-        value=False,
-        label="Split Model",
-        info="Shard the model using gguf-split."
-    )
-
-    split_max_tensors = gr.Number(
-        value=256,
-        label="Max Tensors per File",
-        info="Maximum number of tensors per file when splitting model.",
-        visible=False
-    )
-
-    split_max_size = gr.Textbox(
-        label="Max File Size",
-        info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
-        visible=False
-    )
-
-    def update_visibility(use_imatrix):
-        return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
-
-    use_imatrix.change(
-        fn=update_visibility,
-        inputs=use_imatrix,
-        outputs=[q_method, imatrix_q_method, train_data_file]
-    )
-
     iface = gr.Interface(
         fn=process_model,
         inputs=[
             model_id,
             q_method,
-            use_imatrix,
-            imatrix_q_method,
-            private_repo,
-            train_data_file,
-            split_model,
-            split_max_tensors,
-            split_max_size,
         ],
         outputs=[
             gr.Markdown(label="output"),
             gr.Image(show_label=False),
         ],
-        title="Create your own GGUF Quants, blazingly fast ⚡!",
+        title="Create your own Ollama Models and Push them to the Ollama Library, blazingly fast ⚡!",
         description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
         api_name=False
     )
 
-    def update_split_visibility(split_model):
-        return gr.update(visible=split_model), gr.update(visible=split_model)
-
-    split_model.change(
-        fn=update_split_visibility,
-        inputs=split_model,
-        outputs=[split_max_tensors, split_max_size]
-    )
-
 def restart_space():
-    HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
+    HfApi().restart_space(repo_id="unclemusclez/ollamafy", token=HF_TOKEN, factory_reboot=True)
 
 scheduler = BackgroundScheduler()
 scheduler.add_job(restart_space, "interval", seconds=21600)
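
The new `process_model` writes a one-line Modelfile and shells out to the `ollama` CLI, but as committed the push command it builds (`ollama_push`) is never executed. Below is a rough, self-contained sketch of the create-and-push flow the change is working toward; the helper name `ollamafy_model`, the `local_model_path` argument, and the assumption that the `ollama` binary is installed, signed in, and able to resolve the `FROM` path are illustrative additions, not part of this commit.

```python
# Hedged sketch of the create-and-push flow the commit is working toward.
# Assumptions not taken from the commit: the `ollama` CLI is on PATH, the
# account behind OLLAMA_USERNAME can push to the Ollama library, and
# `local_model_path` points at a local GGUF file (or model directory) that a
# Modelfile `FROM` line can reference.
import os
import subprocess

OLLAMA_USERNAME = os.environ.get("OLLAMA_USERNAME")


def ollamafy_model(local_model_path: str, model_name: str, quant: str) -> str:
    """Write a minimal Modelfile, build the Ollama model, and push it."""
    tag = f"{OLLAMA_USERNAME}/{model_name.lower()}:{quant.lower()}"
    modelfile = f"{model_name}_ollama"

    # A plain conversion only needs a FROM line; close the file before
    # `ollama create` reads it.
    with open(modelfile, "w") as f:
        f.write(f"FROM {local_model_path}\n")

    # `ollama create -f <Modelfile> <tag>` builds the model locally.
    create = subprocess.run(
        ["ollama", "create", "-f", modelfile, tag],
        capture_output=True, text=True,
    )
    if create.returncode != 0:
        raise RuntimeError(f"ollama create failed: {create.stderr}")

    # `ollama push <tag>` uploads it to the Ollama library.
    push = subprocess.run(["ollama", "push", tag], capture_output=True, text=True)
    if push.returncode != 0:
        raise RuntimeError(f"ollama push failed: {push.stderr}")

    return tag
```

If `q_method` is meant to control quantization, encoding it only in the tag will not quantize anything; the quantization would have to happen before or during `ollama create` (recent CLI releases expose a quantize option on `create`), and a loop over the dropdown values could cover the `ALL` case.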