Spaces:

hdeldar
/

convert

Runtime error

App Files Files Community

hdeldar committed on Sep 24, 2023

Commit

431304e

•

1 Parent(s): 76abd46

change code versions

Browse files

Files changed (6) hide show

README.md +2 -3
__pycache__/app.cpython-310.pyc +0 -0
__pycache__/convert.cpython-310.pyc +0 -0
app.py +12 -6
convert.py +169 -25
requirements.txt +1 -1

README.md CHANGED Viewed

@@ -4,14 +4,13 @@ emoji: 🐶
 colorFrom: yellow
 colorTo: red
 sdk: gradio
-sdk_version: 3.8.1
 app_file: app.py
-pinned: false
 license: apache-2.0
 models: []
 datasets:
 - safetensors/conversions
-duplicated_from: safetensors/convert
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 colorFrom: yellow
 colorTo: red
 sdk: gradio
+sdk_version: 3.36.1
 app_file: app.py
+pinned: true
 license: apache-2.0
 models: []
 datasets:
 - safetensors/conversions
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

__pycache__/app.cpython-310.pyc DELETED Viewed

Binary file (2.63 kB)

__pycache__/convert.cpython-310.pyc DELETED Viewed

Binary file (8.15 kB)

app.py CHANGED Viewed

@@ -15,7 +15,8 @@ DATA_FILE = os.path.join("data", DATA_FILENAME)
 HF_TOKEN = os.environ.get("HF_TOKEN")
 repo: Optional[Repository] = None
-if HF_TOKEN:
     repo = Repository(local_dir="data", clone_from=DATASET_REPO_URL, token=HF_TOKEN)
@@ -31,11 +32,12 @@ def run(token: str, model_id: str) -> str:
         is_private = api.model_info(repo_id=model_id).private
         print("is_private", is_private)
-        commit_info = convert(api=api, model_id=model_id, force=True)
         print("[commit_info]", commit_info)
         # save in a (public) dataset:
-        if repo is not None and not is_private:
             repo.git_pull(rebase=True)
             print("pulled")
             with open(DATA_FILE, "a") as csvfile:
@@ -52,13 +54,17 @@ def run(token: str, model_id: str) -> str:
             commit_url = repo.push_to_hub()
             print("[dataset]", commit_url)
-        return f"""
         ### Success 🔥
         Yay! This model was successfully converted and a PR was open using your token, here:
         [{commit_info.pr_url}]({commit_info.pr_url})
         """
     except Exception as e:
         return f"""
         ### Error 😢😢😢
@@ -89,6 +95,6 @@ demo = gr.Interface(
     ],
     outputs=[gr.Markdown(label="output")],
     fn=run,
-)
-demo.launch()

 HF_TOKEN = os.environ.get("HF_TOKEN")
 repo: Optional[Repository] = None
+# TODO
+if False and HF_TOKEN:
     repo = Repository(local_dir="data", clone_from=DATASET_REPO_URL, token=HF_TOKEN)
         is_private = api.model_info(repo_id=model_id).private
         print("is_private", is_private)
+        commit_info, errors = convert(api=api, model_id=model_id)
         print("[commit_info]", commit_info)
         # save in a (public) dataset:
+        # TODO False because of LFS bug.
+        if False and repo is not None and not is_private:
             repo.git_pull(rebase=True)
             print("pulled")
             with open(DATA_FILE, "a") as csvfile:
             commit_url = repo.push_to_hub()
             print("[dataset]", commit_url)
+        string =  f"""
         ### Success 🔥
         Yay! This model was successfully converted and a PR was open using your token, here:
         [{commit_info.pr_url}]({commit_info.pr_url})
         """
+        if errors:
+            string += "\nErrors during conversion:\n"
+            string += "\n".join(f"Error while converting {filename}: {e}, skipped conversion" for filename, e in errors)
+        return string
     except Exception as e:
         return f"""
         ### Error 😢😢😢
     ],
     outputs=[gr.Markdown(label="output")],
     fn=run,
+).queue(max_size=10, concurrency_count=1)
+demo.launch(show_api=True)

convert.py CHANGED Viewed

@@ -5,7 +5,7 @@ import shutil
 from collections import defaultdict
 from inspect import signature
 from tempfile import TemporaryDirectory
-from typing import Dict, List, Optional, Set
 import torch
@@ -13,7 +13,26 @@ from huggingface_hub import CommitInfo, CommitOperationAdd, Discussion, HfApi, h
 from huggingface_hub.file_download import repo_folder_name
 from safetensors.torch import load_file, save_file
 from transformers import AutoConfig
-from transformers.pipelines.base import infer_framework_load_model
 class AlreadyExists(Exception):
@@ -51,15 +70,15 @@ def rename(pt_filename: str) -> str:
     return local
-def convert_multi(model_id: str, folder: str) -> List["CommitOperationAdd"]:
-    filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin.index.json")
     with open(filename, "r") as f:
         data = json.load(f)
     filenames = set(data["weight_map"].values())
     local_filenames = []
     for filename in filenames:
-        pt_filename = hf_hub_download(repo_id=model_id, filename=filename)
         sf_filename = rename(pt_filename)
         sf_filename = os.path.join(folder, sf_filename)
@@ -77,18 +96,20 @@ def convert_multi(model_id: str, folder: str) -> List["CommitOperationAdd"]:
     operations = [
         CommitOperationAdd(path_in_repo=local.split("/")[-1], path_or_fileobj=local) for local in local_filenames
     ]
-    return operations
-def convert_single(model_id: str, folder: str) -> List["CommitOperationAdd"]:
-    pt_filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin")
     sf_name = "model.safetensors"
     sf_filename = os.path.join(folder, sf_name)
     convert_file(pt_filename, sf_filename)
     operations = [CommitOperationAdd(path_in_repo=sf_name, path_or_fileobj=sf_filename)]
-    return operations
 def convert_file(
@@ -133,24 +154,104 @@ def create_diff(pt_infos: Dict[str, List[str]], sf_infos: Dict[str, List[str]])
             errors.append(f"{key} : SF warnings contain {sf_only} which are not present in PT warnings")
     return "\n".join(errors)
 def previous_pr(api: "HfApi", model_id: str, pr_title: str) -> Optional["Discussion"]:
     try:
         discussions = api.get_repo_discussions(repo_id=model_id)
     except Exception:
         return None
     for discussion in discussions:
         if discussion.status == "open" and discussion.is_pull_request and discussion.title == pr_title:
-            return discussion
-def convert_generic(model_id: str, folder: str, filenames: Set[str]) -> List["CommitOperationAdd"]:
     operations = []
     extensions = set([".bin", ".ckpt"])
     for filename in filenames:
         prefix, ext = os.path.splitext(filename)
         if ext in extensions:
-            pt_filename = hf_hub_download(model_id, filename=filename)
             dirname, raw_filename = os.path.split(filename)
             if raw_filename == "pytorch_model.bin":
                 # XXX: This is a special case to handle `transformers` and the
@@ -159,20 +260,19 @@ def convert_generic(model_id: str, folder: str, filenames: Set[str]) -> List["Co
             else:
                 sf_in_repo = f"{prefix}.safetensors"
             sf_filename = os.path.join(folder, sf_in_repo)
-            convert_file(pt_filename, sf_filename)
-            operations.append(CommitOperationAdd(path_in_repo=sf_in_repo, path_or_fileobj=sf_filename))
-    return operations
-def convert(api: "HfApi", model_id: str, force: bool = False) -> Optional["CommitInfo"]:
     pr_title = "Adding `safetensors` variant of this model"
     info = api.model_info(model_id)
-    def is_valid_filename(filename):
-        return len(filename.split("/")) > 1 or filename in ["pytorch_model.bin", "diffusion_pytorch_model.bin"]
-    filenames = set(s.rfilename for s in info.siblings if is_valid_filename(s.rfilename))
-    print(filenames)
     with TemporaryDirectory() as d:
         folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
         os.makedirs(folder)
@@ -188,15 +288,23 @@ def convert(api: "HfApi", model_id: str, force: bool = False) -> Optional["Commi
                 url = f"https://huggingface.co/{model_id}/discussions/{pr.num}"
                 new_pr = pr
                 raise AlreadyExists(f"Model {model_id} already has an open PR check out {url}")
             else:
-                print("Convert generic")
-                operations = convert_generic(model_id, folder, filenames)
             if operations:
                 new_pr = api.create_commit(
                     repo_id=model_id,
                     operations=operations,
                     commit_message=pr_title,
                     create_pr=True,
                 )
                 print(f"Pr created at {new_pr.pr_url}")
@@ -204,7 +312,7 @@ def convert(api: "HfApi", model_id: str, force: bool = False) -> Optional["Commi
                 print("No files to convert")
         finally:
             shutil.rmtree(folder)
-        return new_pr
 if __name__ == "__main__":
@@ -225,7 +333,43 @@ if __name__ == "__main__":
         action="store_true",
         help="Create the PR even if it already exists of if the model was already converted.",
     )
     args = parser.parse_args()
     model_id = args.model_id
     api = HfApi()
-    convert(api, model_id, force=args.force)

 from collections import defaultdict
 from inspect import signature
 from tempfile import TemporaryDirectory
+from typing import Dict, List, Optional, Set, Tuple
 import torch
 from huggingface_hub.file_download import repo_folder_name
 from safetensors.torch import load_file, save_file
 from transformers import AutoConfig
# Text passed as `commit_description` when the conversion PR commit is created.
+COMMIT_DESCRIPTION = """
+This is an automated PR created with https://huggingface.co/spaces/safetensors/convert
+This new file is equivalent to `pytorch_model.bin` but safe in the sense that
+no arbitrary code can be put into it.
+These files also happen to load much faster than their pytorch counterpart:
+https://colab.research.google.com/github/huggingface/notebooks/blob/main/safetensors_doc/en/speed.ipynb
+The widgets on your model page will run using this model even if this is not merged
+making sure the file actually works.
+If you find any issues: please report here: https://huggingface.co/spaces/safetensors/convert/discussions
+Feel free to ignore this PR.
+"""
# Return type shared by the convert_* helpers: the commit operations to upload,
# plus (filename, exception) pairs for files whose conversion was skipped.
+ConversionResult = Tuple[List["CommitOperationAdd"], List[Tuple[str, "Exception"]]]
 class AlreadyExists(Exception):
     return local
+def convert_multi(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
+    filename = hf_hub_download(repo_id=model_id, filename="pytorch_model.bin.index.json", token=token, cache_dir=folder)
     with open(filename, "r") as f:
         data = json.load(f)
     filenames = set(data["weight_map"].values())
     local_filenames = []
     for filename in filenames:
+        pt_filename = hf_hub_download(repo_id=model_id, filename=filename, token=token, cache_dir=folder)
         sf_filename = rename(pt_filename)
         sf_filename = os.path.join(folder, sf_filename)
     operations = [
         CommitOperationAdd(path_in_repo=local.split("/")[-1], path_or_fileobj=local) for local in local_filenames
     ]
+    errors: List[Tuple[str, "Exception"]] = []
+    return operations, errors
def convert_single(model_id: str, folder: str, token: Optional[str]) -> ConversionResult:
    """Convert a repo's single `pytorch_model.bin` into `model.safetensors`.

    The checkpoint is fetched with `hf_hub_download` (with `folder` as the
    cache directory) and written by `convert_file` to
    `<folder>/model.safetensors`.

    Args:
        model_id: Hub repo id the checkpoint is downloaded from.
        folder: Directory used as download cache and as output location.
        token: Forwarded to `hf_hub_download`; may be None.

    Returns:
        A pair `(operations, errors)`: a one-element list holding the
        `CommitOperationAdd` that uploads the converted file as
        `model.safetensors`, and an empty error list (nothing is collected
        here; any failure propagates as an exception).
    """
    target_name = "model.safetensors"
    checkpoint_path = hf_hub_download(
        repo_id=model_id,
        filename="pytorch_model.bin",
        token=token,
        cache_dir=folder,
    )
    target_path = os.path.join(folder, target_name)
    convert_file(checkpoint_path, target_path)

    no_errors: List[Tuple[str, "Exception"]] = []
    upload = CommitOperationAdd(path_in_repo=target_name, path_or_fileobj=target_path)
    return [upload], no_errors
 def convert_file(
             errors.append(f"{key} : SF warnings contain {sf_only} which are not present in PT warnings")
     return "\n".join(errors)
def check_final_model(model_id: str, folder: str, token: Optional[str]) -> None:
    """Reload the model found in `folder` and compare the loading reports.

    Downloads `config.json` for `model_id`, copies it into `folder`, resolves
    the model class named by `config.architectures[0]` from `transformers`,
    then loads it twice with `output_loading_info=True` and compares the two
    loading-info results.

    Args:
        model_id: Hub repo id the config is downloaded from.
        folder: Directory holding the weights; also used as download cache.
        token: Forwarded to `hf_hub_download`; may be None.

    Raises:
        ValueError: If the two loading-info results differ.

    Only the loading-info comparison runs. The deeper checks after the early
    `return` (shared tensors, forward-pass equality) are deliberately
    unreachable to save RAM and are kept for reference.
    """
    config = hf_hub_download(repo_id=model_id, filename="config.json", token=token, cache_dir=folder)
    shutil.copy(config, os.path.join(folder, "config.json"))
    config = AutoConfig.from_pretrained(folder)

    import transformers

    class_ = getattr(transformers, config.architectures[0])
    # NOTE(review): `torch.device` as a context manager needs PyTorch >= 2.0,
    # while the requirements.txt shown with this change pins torch==1.13.1 --
    # confirm which version actually runs this code.
    with torch.device("meta"):
        # NOTE(review): both loads use the identical call on the same folder;
        # presumably one is meant to pick up the `.bin` weights and the other
        # the `.safetensors` weights -- confirm how they are told apart.
        (pt_model, pt_infos) = class_.from_pretrained(folder, output_loading_info=True)
        (sf_model, sf_infos) = class_.from_pretrained(folder, output_loading_info=True)

        if pt_infos != sf_infos:
            error_string = create_diff(pt_infos, sf_infos)
            raise ValueError(f"Different infos when reloading the model: {error_string}")

    #### XXXXXXXXXXXXXXXXXXXXXXXXXXXXX
    ####  SKIPPING THE REST OF THE test to save RAM
    return

    # ---- Unreachable from here on (see marker above). ----
    pt_params = pt_model.state_dict()
    sf_params = sf_model.state_dict()

    pt_shared = shared_pointers(pt_params)
    sf_shared = shared_pointers(sf_params)
    if pt_shared != sf_shared:
        # Fixed: the message was a plain string with placeholders naming
        # variables that do not exist (`shared_pt` / `shared_tf`).
        raise RuntimeError(
            f"The reconstructed model is wrong, shared tensors are different {pt_shared} != {sf_shared}"
        )

    # Build dummy inputs for whichever arguments the model's forward() accepts.
    sig = signature(pt_model.forward)
    input_ids = torch.arange(10).unsqueeze(0)
    pixel_values = torch.randn(1, 3, 224, 224)
    input_values = torch.arange(1000).float().unsqueeze(0)
    # Hardcoded for whisper basically
    input_features = torch.zeros((1, 80, 3000))
    kwargs = {}
    if "input_ids" in sig.parameters:
        kwargs["input_ids"] = input_ids
    if "input_features" in sig.parameters:
        kwargs["input_features"] = input_features
    if "decoder_input_ids" in sig.parameters:
        kwargs["decoder_input_ids"] = input_ids
    if "pixel_values" in sig.parameters:
        kwargs["pixel_values"] = pixel_values
    if "input_values" in sig.parameters:
        kwargs["input_values"] = input_values
    if "bbox" in sig.parameters:
        kwargs["bbox"] = torch.zeros((1, 10, 4)).long()
    if "image" in sig.parameters:
        kwargs["image"] = pixel_values

    if torch.cuda.is_available():
        pt_model = pt_model.cuda()
        sf_model = sf_model.cuda()
        kwargs = {k: v.cuda() for k, v in kwargs.items()}

    try:
        pt_logits = pt_model(**kwargs)[0]
    except Exception as e:
        try:
            # Musicgen special exception.
            decoder_input_ids = torch.ones((input_ids.shape[0] * pt_model.decoder.num_codebooks, 1), dtype=torch.long)
            if torch.cuda.is_available():
                decoder_input_ids = decoder_input_ids.cuda()

            kwargs["decoder_input_ids"] = decoder_input_ids
            pt_logits = pt_model(**kwargs)[0]
        except Exception:
            # The retry failed too: surface the original failure, not the retry's.
            raise e
    sf_logits = sf_model(**kwargs)[0]

    torch.testing.assert_close(sf_logits, pt_logits)
    print(f"Model {model_id} is ok !")
 def previous_pr(api: "HfApi", model_id: str, pr_title: str) -> Optional["Discussion"]:
     """Find an open conversion PR that still sits directly on top of main.

     Args:
         api: Hub client; `list_repo_commits` and `get_repo_discussions` are used.
         model_id: Repo id whose discussions are scanned.
         pr_title: Exact PR title to look for.

     Returns:
         The first open pull request titled `pr_title` whose second listed
         commit is the first commit listed for the repo's default revision,
         or None when there is no such PR or when fetching the main commit or
         the discussions fails.
     """
     try:
         main_commit = api.list_repo_commits(model_id)[0].commit_id
         discussions = api.get_repo_discussions(repo_id=model_id)
     except Exception:
         return None
     for discussion in discussions:
         if discussion.status == "open" and discussion.is_pull_request and discussion.title == pr_title:
             commits = api.list_repo_commits(model_id, revision=discussion.git_reference)
             # A PR ref with fewer than two commits has no parent entry to
             # compare; skip it instead of failing with IndexError.
             if len(commits) > 1 and main_commit == commits[1].commit_id:
                 return discussion
     return None
+def convert_generic(model_id: str, folder: str, filenames: Set[str], token: Optional[str]) -> ConversionResult:
     operations = []
+    errors = []
     extensions = set([".bin", ".ckpt"])
     for filename in filenames:
         prefix, ext = os.path.splitext(filename)
         if ext in extensions:
+            pt_filename = hf_hub_download(model_id, filename=filename, token=token, cache_dir=folder)
             dirname, raw_filename = os.path.split(filename)
             if raw_filename == "pytorch_model.bin":
                 # XXX: This is a special case to handle `transformers` and the
             else:
                 sf_in_repo = f"{prefix}.safetensors"
             sf_filename = os.path.join(folder, sf_in_repo)
+            try:
+                convert_file(pt_filename, sf_filename)
+                operations.append(CommitOperationAdd(path_in_repo=sf_in_repo, path_or_fileobj=sf_filename))
+            except Exception as e:
+                errors.append((pt_filename, e))
+    return operations, errors
+def convert(api: "HfApi", model_id: str, force: bool = False) -> Tuple["CommitInfo", List[Tuple[str, "Exception"]]]:
     pr_title = "Adding `safetensors` variant of this model"
     info = api.model_info(model_id)
+    filenames = set(s.rfilename for s in info.siblings)
     with TemporaryDirectory() as d:
         folder = os.path.join(d, repo_folder_name(repo_id=model_id, repo_type="models"))
         os.makedirs(folder)
                 url = f"https://huggingface.co/{model_id}/discussions/{pr.num}"
                 new_pr = pr
                 raise AlreadyExists(f"Model {model_id} already has an open PR check out {url}")
+            elif library_name == "transformers":
+                if "pytorch_model.bin" in filenames:
+                    operations, errors = convert_single(model_id, folder, token=api.token)
+                elif "pytorch_model.bin.index.json" in filenames:
+                    operations, errors = convert_multi(model_id, folder, token=api.token)
+                else:
+                    raise RuntimeError(f"Model {model_id} doesn't seem to be a valid pytorch model. Cannot convert")
+                check_final_model(model_id, folder, token=api.token)
             else:
+                operations, errors = convert_generic(model_id, folder, filenames, token=api.token)
             if operations:
                 new_pr = api.create_commit(
                     repo_id=model_id,
                     operations=operations,
                     commit_message=pr_title,
+                    commit_description=COMMIT_DESCRIPTION,
                     create_pr=True,
                 )
                 print(f"Pr created at {new_pr.pr_url}")
                 print("No files to convert")
         finally:
             shutil.rmtree(folder)
+        return new_pr, errors
 if __name__ == "__main__":
         action="store_true",
         help="Create the PR even if it already exists of if the model was already converted.",
     )
+    parser.add_argument(
+        "-y",
+        action="store_true",
+        help="Ignore safety prompt",
+    )
     args = parser.parse_args()
     model_id = args.model_id
     api = HfApi()
+    if args.y:
+        txt = "y"
+    else:
+        txt = input(
+            "This conversion script will unpickle a pickled file, which is inherently unsafe. If you do not trust this file, we invite you to use"
+            " https://huggingface.co/spaces/safetensors/convert or google colab or other hosted solution to avoid potential issues with this file."
+            " Continue [Y/n] ?"
+        )
+    if txt.lower() in {"", "y"}:
+        try:
+            commit_info, errors = convert(api, model_id, force=args.force)
+            string = f"""
+### Success 🔥
+Yay! This model was successfully converted and a PR was open using your token, here:
+[{commit_info.pr_url}]({commit_info.pr_url})
+            """
+            if errors:
+                string += "\nErrors during conversion:\n"
+                string += "\n".join(
+                    f"Error while converting {filename}: {e}, skipped conversion" for filename, e in errors
+                )
+            print(string)
+        except Exception as e:
+            print(
+                f"""
+### Error 😢😢😢
+{e}
+            """
+            )
+    else:
+        print(f"Answer was `{txt}` aborting.")

requirements.txt CHANGED Viewed

@@ -1,6 +1,6 @@
 huggingface_hub
 setuptools_rust
-safetensors
 torch==1.13.1
 transformers
 pytorch_lightning

 huggingface_hub
 setuptools_rust
+safetensors>=0.3
 torch==1.13.1
 transformers
 pytorch_lightning