hdeldar committed on
Commit
431304e
1 Parent(s): 76abd46

change code versions

Browse files
README.md CHANGED
@@ -4,14 +4,13 @@ emoji: 🐶
4
  colorFrom: yellow
5
  colorTo: red
6
  sdk: gradio
7
- sdk_version: 3.8.1
8
  app_file: app.py
9
- pinned: false
10
  license: apache-2.0
11
  models: []
12
  datasets:
13
  - safetensors/conversions
14
- duplicated_from: safetensors/convert
15
  ---
16
 
17
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
4
  colorFrom: yellow
5
  colorTo: red
6
  sdk: gradio
7
+ sdk_version: 3.36.1
8
  app_file: app.py
9
+ pinned: true
10
  license: apache-2.0
11
  models: []
12
  datasets:
13
  - safetensors/conversions
 
14
  ---
15
 
16
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/app.cpython-310.pyc DELETED
Binary file (2.63 kB)
 
__pycache__/convert.cpython-310.pyc DELETED
Binary file (8.15 kB)
 
app.py CHANGED
@@ -15,7 +15,8 @@ DATA_FILE = os.path.join("data", DATA_FILENAME)
15
  HF_TOKEN = os.environ.get("HF_TOKEN")
16
 
17
  repo: Optional[Repository] = None
18
- if HF_TOKEN:
 
19
  repo = Repository(local_dir="data", clone_from=DATASET_REPO_URL, token=HF_TOKEN)
20
 
21
 
@@ -31,11 +32,12 @@ def run(token: str, model_id: str) -> str:
31
  is_private = api.model_info(repo_id=model_id).private
32
  print("is_private", is_private)
33
 
34
- commit_info = convert(api=api, model_id=model_id, force=True)
35
  print("[commit_info]", commit_info)
36
 
37
  # save in a (public) dataset:
38
- if repo is not None and not is_private:
 
39
  repo.git_pull(rebase=True)
40
  print("pulled")
41
  with open(DATA_FILE, "a") as csvfile:
@@ -52,13 +54,17 @@ def run(token: str, model_id: str) -> str:
52
  commit_url = repo.push_to_hub()
53
  print("[dataset]", commit_url)
54
 
55
- return f"""
56
  ### Success 🔥
57
 
58
  Yay! This model was successfully converted and a PR was open using your token, here:
59
 
60
  [{commit_info.pr_url}]({commit_info.pr_url})
61
  """
 
 
 
 
62
  except Exception as e:
63
  return f"""
64
  ### Error 😢😢😢
@@ -89,6 +95,6 @@ demo = gr.Interface(
89
  ],
90
  outputs=[gr.Markdown(label="output")],
91
  fn=run,
92
- )
93
 
94
- demo.launch()
 
15
  HF_TOKEN = os.environ.get("HF_TOKEN")
16
 
17
  repo: Optional[Repository] = None
18
+ # TODO
19
+ if False and HF_TOKEN:
20
  repo = Repository(local_dir="data", clone_from=DATASET_REPO_URL, token=HF_TOKEN)
21
 
22
 
 
32
  is_private = api.model_info(repo_id=model_id).private
33
  print("is_private", is_private)
34
 
35
+ commit_info, errors = convert(api=api, model_id=model_id)
36
  print("[commit_info]", commit_info)
37
 
38
  # save in a (public) dataset:
39
+ # TODO False because of LFS bug.
40
+ if False and repo is not None and not is_private:
41
  repo.git_pull(rebase=True)
42
  print("pulled")
43
  with open(DATA_FILE, "a") as csvfile:
 
54
  commit_url = repo.push_to_hub()
55
  print("[dataset]", commit_url)
56
 
57
+ string = f"""
58
  ### Success 🔥
59
 
60
  Yay! This model was successfully converted and a PR was open using your token, here:
61
 
62
  [{commit_info.pr_url}]({commit_info.pr_url})
63
  """
64
+ if errors:
65
+ string += "\nErrors during conversion:\n"
66
+ string += "\n".join(f"Error while converting {filename}: {e}, skipped conversion" for filename, e in errors)
67
+ return string
68
  except Exception as e:
69
  return f"""
70
  ### Error 😢😢😢
 
95
  ],
96
  outputs=[gr.Markdown(label="output")],
97
  fn=run,
98
+ ).queue(max_size=10, concurrency_count=1)
99
 
100
+ demo.launch(show_api=True)
convert.py CHANGED
@@ -5,7 +5,7 @@ import shutil
5
  from collections import defaultdict
6
  from inspect import signature
7
  from tempfile import TemporaryDirectory
8
- from typing import Dict, List, Optional, Set
9
 
10
  import torch
11
 
@@ -13,7 +13,26 @@ from huggingface_hub import CommitInfo, CommitOperationAdd, Discussion, HfApi, h
13
  from huggingface_hub.file_download import repo_folder_name
14
  from safetensors.torch import load_file, save_file
15
  from transformers import AutoConfig
16
- from transformers.pipelines.base import infer_framework_load_model
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
 
19
  class AlreadyExists(Exception):
@@ -51,15 +70,15 @@ def rename(pt_filename: str) -> str:
51
  return local
52
 
53
 
54
- def convert_multi(model_id: str, folder: str) -> List["CommitOperationAdd"]:
55
- filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin.index.json")
56
  with open(filename, "r") as f:
57
  data = json.load(f)
58
 
59
  filenames = set(data["weight_map"].values())
60
  local_filenames = []
61
  for filename in filenames:
62
- pt_filename = hf_hub_download(repo_id=model_id, filename=filename)
63
 
64
  sf_filename = rename(pt_filename)
65
  sf_filename = os.path.join(folder, sf_filename)
@@ -77,18 +96,20 @@ def convert_multi(model_id: str, folder: str) -> List["CommitOperationAdd"]:
77
  operations = [
78
  CommitOperationAdd(path_in_repo=local.split("/")[-1], path_or_fileobj=local) for local in local_filenames
79
  ]
 
80
 
81
- return operations
82
 
83
 
84
- def convert_single(model_id: str, folder: str) -> List["CommitOperationAdd"]:
85
- pt_filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin")
86
 
87
  sf_name = "model.safetensors"
88
  sf_filename = os.path.join(folder, sf_name)
89
  convert_file(pt_filename, sf_filename)
90
  operations = [CommitOperationAdd(path_in_repo=sf_name, path_or_fileobj=sf_filename)]
91
- return operations
 
92
 
93
 
94
  def convert_file(
@@ -133,24 +154,104 @@ def create_diff(pt_infos: Dict[str, List[str]], sf_infos: Dict[str, List[str]])
133
  errors.append(f"{key} : SF warnings contain {sf_only} which are not present in PT warnings")
134
  return "\n".join(errors)
135
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
  def previous_pr(api: "HfApi", model_id: str, pr_title: str) -> Optional["Discussion"]:
137
  try:
 
138
  discussions = api.get_repo_discussions(repo_id=model_id)
139
  except Exception:
140
  return None
141
  for discussion in discussions:
142
  if discussion.status == "open" and discussion.is_pull_request and discussion.title == pr_title:
143
- return discussion
144
 
 
 
 
145
 
146
- def convert_generic(model_id: str, folder: str, filenames: Set[str]) -> List["CommitOperationAdd"]:
 
147
  operations = []
 
148
 
149
  extensions = set([".bin", ".ckpt"])
150
  for filename in filenames:
151
  prefix, ext = os.path.splitext(filename)
152
  if ext in extensions:
153
- pt_filename = hf_hub_download(model_id, filename=filename)
154
  dirname, raw_filename = os.path.split(filename)
155
  if raw_filename == "pytorch_model.bin":
156
  # XXX: This is a special case to handle `transformers` and the
@@ -159,20 +260,19 @@ def convert_generic(model_id: str, folder: str, filenames: Set[str]) -> List["Co
159
  else:
160
  sf_in_repo = f"{prefix}.safetensors"
161
  sf_filename = os.path.join(folder, sf_in_repo)
162
- convert_file(pt_filename, sf_filename)
163
- operations.append(CommitOperationAdd(path_in_repo=sf_in_repo, path_or_fileobj=sf_filename))
164
- return operations
 
 
 
165
 
166
 
167
- def convert(api: "HfApi", model_id: str, force: bool = False) -> Optional["CommitInfo"]:
168
  pr_title = "Adding `safetensors` variant of this model"
169
  info = api.model_info(model_id)
 
170
 
171
- def is_valid_filename(filename):
172
- return len(filename.split("/")) > 1 or filename in ["pytorch_model.bin", "diffusion_pytorch_model.bin"]
173
- filenames = set(s.rfilename for s in info.siblings if is_valid_filename(s.rfilename))
174
-
175
- print(filenames)
176
  with TemporaryDirectory() as d:
177
  folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
178
  os.makedirs(folder)
@@ -188,15 +288,23 @@ def convert(api: "HfApi", model_id: str, force: bool = False) -> Optional["Commi
188
  url = f"https://huggingface.co/{model_id}/discussions/{pr.num}"
189
  new_pr = pr
190
  raise AlreadyExists(f"Model {model_id} already has an open PR check out {url}")
 
 
 
 
 
 
 
 
191
  else:
192
- print("Convert generic")
193
- operations = convert_generic(model_id, folder, filenames)
194
 
195
  if operations:
196
  new_pr = api.create_commit(
197
  repo_id=model_id,
198
  operations=operations,
199
  commit_message=pr_title,
 
200
  create_pr=True,
201
  )
202
  print(f"Pr created at {new_pr.pr_url}")
@@ -204,7 +312,7 @@ def convert(api: "HfApi", model_id: str, force: bool = False) -> Optional["Commi
204
  print("No files to convert")
205
  finally:
206
  shutil.rmtree(folder)
207
- return new_pr
208
 
209
 
210
  if __name__ == "__main__":
@@ -225,7 +333,43 @@ if __name__ == "__main__":
225
  action="store_true",
226
  help="Create the PR even if it already exists of if the model was already converted.",
227
  )
 
 
 
 
 
228
  args = parser.parse_args()
229
  model_id = args.model_id
230
  api = HfApi()
231
- convert(api, model_id, force=args.force)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from collections import defaultdict
6
  from inspect import signature
7
  from tempfile import TemporaryDirectory
8
+ from typing import Dict, List, Optional, Set, Tuple
9
 
10
  import torch
11
 
 
13
  from huggingface_hub.file_download import repo_folder_name
14
  from safetensors.torch import load_file, save_file
15
  from transformers import AutoConfig
16
+
17
+
18
+ COMMIT_DESCRIPTION = """
19
+ This is an automated PR created with https://huggingface.co/spaces/safetensors/convert
20
+
21
+ This new file is equivalent to `pytorch_model.bin` but safe in the sense that
22
+ no arbitrary code can be put into it.
23
+
24
+ These files also happen to load much faster than their pytorch counterpart:
25
+ https://colab.research.google.com/github/huggingface/notebooks/blob/main/safetensors_doc/en/speed.ipynb
26
+
27
+ The widgets on your model page will run using this model even if this is not merged
28
+ making sure the file actually works.
29
+
30
+ If you find any issues: please report here: https://huggingface.co/spaces/safetensors/convert/discussions
31
+
32
+ Feel free to ignore this PR.
33
+ """
34
+
35
+ ConversionResult = Tuple[List["CommitOperationAdd"], List[Tuple[str, "Exception"]]]
36
 
37
 
38
  class AlreadyExists(Exception):
 
70
  return local
71
 
72
 
73
+ def convert_multi(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
74
+ filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin.index.json", token=token, cache_dir=folder)
75
  with open(filename, "r") as f:
76
  data = json.load(f)
77
 
78
  filenames = set(data["weight_map"].values())
79
  local_filenames = []
80
  for filename in filenames:
81
+ pt_filename = hf_hub_download(repo_id=model_id, filename=filename, token=token, cache_dir=folder)
82
 
83
  sf_filename = rename(pt_filename)
84
  sf_filename = os.path.join(folder, sf_filename)
 
96
  operations = [
97
  CommitOperationAdd(path_in_repo=local.split("/")[-1], path_or_fileobj=local) for local in local_filenames
98
  ]
99
+ errors: List[Tuple[str, "Exception"]] = []
100
 
101
+ return operations, errors
102
 
103
 
104
+ def convert_single(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
105
+ pt_filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin", token=token, cache_dir=folder)
106
 
107
  sf_name = "model.safetensors"
108
  sf_filename = os.path.join(folder, sf_name)
109
  convert_file(pt_filename, sf_filename)
110
  operations = [CommitOperationAdd(path_in_repo=sf_name, path_or_fileobj=sf_filename)]
111
+ errors: List[Tuple[str, "Exception"]] = []
112
+ return operations, errors
113
 
114
 
115
  def convert_file(
 
154
  errors.append(f"{key} : SF warnings contain {sf_only} which are not present in PT warnings")
155
  return "\n".join(errors)
156
 
157
+
158
+ def check_final_model(model_id: str, folder: str, token: Optional[str]):
159
+ config = hf_hub_download(repo_id=model_id, filename="config.json", token=token, cache_dir=folder)
160
+ shutil.copy(config, os.path.join(folder, "config.json"))
161
+ config = AutoConfig.from_pretrained(folder)
162
+
163
+ import transformers
164
+
165
+ class_ = getattr(transformers, config.architectures[0])
166
+ with torch.device("meta"):
167
+ (pt_model, pt_infos) = class_.from_pretrained(folder, output_loading_info=True)
168
+ (sf_model, sf_infos) = class_.from_pretrained(folder, output_loading_info=True)
169
+
170
+ if pt_infos != sf_infos:
171
+ error_string = create_diff(pt_infos, sf_infos)
172
+ raise ValueError(f"Different infos when reloading the model: {error_string}")
173
+
174
+ #### XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
175
+ #### SKIPPING THE REST OF THE test to save RAM
176
+ return
177
+ pt_params = pt_model.state_dict()
178
+ sf_params = sf_model.state_dict()
179
+
180
+ pt_shared = shared_pointers(pt_params)
181
+ sf_shared = shared_pointers(sf_params)
182
+ if pt_shared != sf_shared:
183
+ raise RuntimeError("The reconstructed model is wrong, shared tensors are different {shared_pt} != {shared_tf}")
184
+
185
+ sig = signature(pt_model.forward)
186
+ input_ids = torch.arange(10).unsqueeze(0)
187
+ pixel_values = torch.randn(1, 3, 224, 224)
188
+ input_values = torch.arange(1000).float().unsqueeze(0)
189
+ # Hardcoded for whisper basically
190
+ input_features = torch.zeros((1, 80, 3000))
191
+ kwargs = {}
192
+ if "input_ids" in sig.parameters:
193
+ kwargs["input_ids"] = input_ids
194
+ if "input_features" in sig.parameters:
195
+ kwargs["input_features"] = input_features
196
+ if "decoder_input_ids" in sig.parameters:
197
+ kwargs["decoder_input_ids"] = input_ids
198
+ if "pixel_values" in sig.parameters:
199
+ kwargs["pixel_values"] = pixel_values
200
+ if "input_values" in sig.parameters:
201
+ kwargs["input_values"] = input_values
202
+ if "bbox" in sig.parameters:
203
+ kwargs["bbox"] = torch.zeros((1, 10, 4)).long()
204
+ if "image" in sig.parameters:
205
+ kwargs["image"] = pixel_values
206
+
207
+ if torch.cuda.is_available():
208
+ pt_model = pt_model.cuda()
209
+ sf_model = sf_model.cuda()
210
+ kwargs = {k: v.cuda() for k, v in kwargs.items()}
211
+
212
+ try:
213
+ pt_logits = pt_model(**kwargs)[0]
214
+ except Exception as e:
215
+ try:
216
+ # Musicgen special exception.
217
+ decoder_input_ids = torch.ones((input_ids.shape[0] * pt_model.decoder.num_codebooks, 1), dtype=torch.long)
218
+ if torch.cuda.is_available():
219
+ decoder_input_ids = decoder_input_ids.cuda()
220
+
221
+ kwargs["decoder_input_ids"] = decoder_input_ids
222
+ pt_logits = pt_model(**kwargs)[0]
223
+ except Exception:
224
+ raise e
225
+ sf_logits = sf_model(**kwargs)[0]
226
+
227
+ torch.testing.assert_close(sf_logits, pt_logits)
228
+ print(f"Model {model_id} is ok !")
229
+
230
+
231
  def previous_pr(api: "HfApi", model_id: str, pr_title: str) -> Optional["Discussion"]:
232
  try:
233
+ main_commit = api.list_repo_commits(model_id)[0].commit_id
234
  discussions = api.get_repo_discussions(repo_id=model_id)
235
  except Exception:
236
  return None
237
  for discussion in discussions:
238
  if discussion.status == "open" and discussion.is_pull_request and discussion.title == pr_title:
239
+ commits = api.list_repo_commits(model_id, revision=discussion.git_reference)
240
 
241
+ if main_commit == commits[1].commit_id:
242
+ return discussion
243
+ return None
244
 
245
+
246
+ def convert_generic(model_id: str, folder: str, filenames: Set[str], token: Optional[str]) -> ConversionResult:
247
  operations = []
248
+ errors = []
249
 
250
  extensions = set([".bin", ".ckpt"])
251
  for filename in filenames:
252
  prefix, ext = os.path.splitext(filename)
253
  if ext in extensions:
254
+ pt_filename = hf_hub_download(model_id, filename=filename, token=token, cache_dir=folder)
255
  dirname, raw_filename = os.path.split(filename)
256
  if raw_filename == "pytorch_model.bin":
257
  # XXX: This is a special case to handle `transformers` and the
 
260
  else:
261
  sf_in_repo = f"{prefix}.safetensors"
262
  sf_filename = os.path.join(folder, sf_in_repo)
263
+ try:
264
+ convert_file(pt_filename, sf_filename)
265
+ operations.append(CommitOperationAdd(path_in_repo=sf_in_repo, path_or_fileobj=sf_filename))
266
+ except Exception as e:
267
+ errors.append((pt_filename, e))
268
+ return operations, errors
269
 
270
 
271
+ def convert(api: "HfApi", model_id: str, force: bool = False) -> Tuple["CommitInfo", List[Tuple[str, "Exception"]]]:
272
  pr_title = "Adding `safetensors` variant of this model"
273
  info = api.model_info(model_id)
274
+ filenames = set(s.rfilename for s in info.siblings)
275
 
 
 
 
 
 
276
  with TemporaryDirectory() as d:
277
  folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
278
  os.makedirs(folder)
 
288
  url = f"https://huggingface.co/{model_id}/discussions/{pr.num}"
289
  new_pr = pr
290
  raise AlreadyExists(f"Model {model_id} already has an open PR check out {url}")
291
+ elif library_name == "transformers":
292
+ if "pytorch_model.bin" in filenames:
293
+ operations, errors = convert_single(model_id, folder, token=api.token)
294
+ elif "pytorch_model.bin.index.json" in filenames:
295
+ operations, errors = convert_multi(model_id, folder, token=api.token)
296
+ else:
297
+ raise RuntimeError(f"Model {model_id} doesn't seem to be a valid pytorch model. Cannot convert")
298
+ check_final_model(model_id, folder, token=api.token)
299
  else:
300
+ operations, errors = convert_generic(model_id, folder, filenames, token=api.token)
 
301
 
302
  if operations:
303
  new_pr = api.create_commit(
304
  repo_id=model_id,
305
  operations=operations,
306
  commit_message=pr_title,
307
+ commit_description=COMMIT_DESCRIPTION,
308
  create_pr=True,
309
  )
310
  print(f"Pr created at {new_pr.pr_url}")
 
312
  print("No files to convert")
313
  finally:
314
  shutil.rmtree(folder)
315
+ return new_pr, errors
316
 
317
 
318
  if __name__ == "__main__":
 
333
  action="store_true",
334
  help="Create the PR even if it already exists of if the model was already converted.",
335
  )
336
+ parser.add_argument(
337
+ "-y",
338
+ action="store_true",
339
+ help="Ignore safety prompt",
340
+ )
341
  args = parser.parse_args()
342
  model_id = args.model_id
343
  api = HfApi()
344
+ if args.y:
345
+ txt = "y"
346
+ else:
347
+ txt = input(
348
+ "This conversion script will unpickle a pickled file, which is inherently unsafe. If you do not trust this file, we invite you to use"
349
+ " https://huggingface.co/spaces/safetensors/convert or google colab or other hosted solution to avoid potential issues with this file."
350
+ " Continue [Y/n] ?"
351
+ )
352
+ if txt.lower() in {"", "y"}:
353
+ try:
354
+ commit_info, errors = convert(api, model_id, force=args.force)
355
+ string = f"""
356
+ ### Success 🔥
357
+ Yay! This model was successfully converted and a PR was open using your token, here:
358
+ [{commit_info.pr_url}]({commit_info.pr_url})
359
+ """
360
+ if errors:
361
+ string += "\nErrors during conversion:\n"
362
+ string += "\n".join(
363
+ f"Error while converting {filename}: {e}, skipped conversion" for filename, e in errors
364
+ )
365
+ print(string)
366
+ except Exception as e:
367
+ print(
368
+ f"""
369
+ ### Error 😢😢😢
370
+
371
+ {e}
372
+ """
373
+ )
374
+ else:
375
+ print(f"Answer was `{txt}` aborting.")
requirements.txt CHANGED
@@ -1,6 +1,6 @@
1
  huggingface_hub
2
  setuptools_rust
3
- safetensors
4
  torch==1.13.1
5
  transformers
6
  pytorch_lightning
 
1
  huggingface_hub
2
  setuptools_rust
3
+ safetensors>=0.3
4
  torch==1.13.1
5
  transformers
6
  pytorch_lightning