import gc

import gradio as gr
import torch
from huggingface_hub import HfApi, login
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

api = HfApi()
def info_fn(text):
    gr.Info(text)

def warning_fn(text):
    gr.Warning(text)
def upload(hf_token, base_model_name_or_path, peft_model_path, output_dir):
    try:
        login(hf_token)
        repo_name = output_dir
        device_arg = {"device_map": "cpu"}

        # Load the base model and the LoRA adapter on CPU, then fold the adapter
        # weights into the base weights.
        info_fn(f"Loading base model: {base_model_name_or_path}")
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name_or_path, torch_dtype=torch.bfloat16, **device_arg
        )
        info_fn(f"Loading PEFT adapter: {peft_model_path}")
        model = PeftModel.from_pretrained(base_model, peft_model_path, **device_arg)
        info_fn("Running merge_and_unload")
        model = model.merge_and_unload()

        tokenizer = AutoTokenizer.from_pretrained(base_model_name_or_path)

        info_fn("Saving model...")
        model.save_pretrained(output_dir, safe_serialization=True)
        info_fn("Saving tokenizer...")
        tokenizer.save_pretrained(output_dir)
        info_fn(f"Model saved to {output_dir}")

        # Free the merged model before uploading; only the files on disk are needed now.
        del model
        gc.collect()

        try:
            info_fn("Creating repo...")
            info_fn(str(api.create_repo(repo_id=repo_name)))
        except Exception as e:
            warning_fn(f"Repo already exists: {e}")

        info_fn("Uploading to the Hub...")
        return api.upload_folder(
            folder_path=output_dir,
            repo_id=repo_name,
            repo_type="model",
        )
    except Exception as e:
        gc.collect()
        raise gr.Error(str(e))
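# Illustrative only: a hypothetical direct call to upload(); in the Space the
# arguments come from the Gradio inputs defined further down.
# upload(
#     hf_token="hf_...",                                   # write-access token
#     base_model_name_or_path="meta-llama/Llama-2-7b-hf",  # hypothetical base model
#     peft_model_path="someuser/llama-2-7b-lora",          # hypothetical adapter repo
#     output_dir="someuser/llama-2-7b-merged",             # destination repo name
# )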
INTRODUCTION_TEXT = """
🎯 This space lets you merge your LoRA adapters into their base model.
## ❓ What is LoRA?
LoRA (Low-Rank Adaptation of Large Language Models) lets you fine-tune LLMs at low cost. LoRA freezes the pre-trained model weights and injects trainable rank-decomposition matrices into each layer of the Transformer architecture, greatly reducing the number of trainable parameters for downstream tasks.
You can learn more about LoRA here:
[📝 LoRA: Low-Rank Adaptation of Large Language Models (arXiv)](https://arxiv.org/abs/2106.09685)
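For illustration only (this space merges adapters, it does not train them), creating a LoRA adapter with the PEFT library looks roughly like this; the model name and hyperparameters below are placeholder example values:
```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("your-base-model")
# Example: rank-8 adapters on the attention projections of a causal LM.
config = LoraConfig(r=8, lora_alpha=16, lora_dropout=0.05,
                    target_modules=["q_proj", "v_proj"], task_type="CAUSAL_LM")
peft_model = get_peft_model(base, config)   # only the low-rank A/B matrices are trainable
peft_model.print_trainable_parameters()     # shows how few parameters actually train
```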
## 🛠️ How does this space work?
🛠️ The space's backend mainly runs the transformers and PEFT libraries.
🤖 The code first loads your base model and then your adapter model.
📚 It merges the adapter weights into the base weights using the `merge_and_unload` function from the PEFT library.
📤 It saves the merged model temporarily and then pushes it to the Hub.
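In essence, the backend does the equivalent of the following (the repository names are placeholders):
```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM

base = AutoModelForCausalLM.from_pretrained("your-base-model", torch_dtype=torch.bfloat16, device_map="cpu")
merged = PeftModel.from_pretrained(base, "your-lora-adapter").merge_and_unload()  # fold LoRA into the base weights
merged.save_pretrained("your-merged-model", safe_serialization=True)
```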
## 🧮 Required RAM
This space loads the full model into RAM in bfloat16 without any quantization, so the RAM requirement is high: a 7B-parameter model alone takes roughly 7 × 10⁹ parameters × 2 bytes ≈ 14 GB, before the adapter and merge overhead.
You can merge models of up to about 7B parameters. (If your adapter weights are very large, it might not work.)
"""
with gr.Blocks() as demo:
    gr.Markdown("""<h1 align="center" id="space-title">🚀 LoRA Merge</h1>""")
    gr.Markdown(INTRODUCTION_TEXT)
    with gr.Row():
        with gr.Column(scale=1):
            hf_token = gr.Textbox(label="Hugging Face Write Access Token")
            base_model_name_or_path = gr.Textbox(label="Base Model")
            peft_model_path = gr.Textbox(label="Adapter Model")
            output_dir = gr.Textbox(label="Output Model Name")
        with gr.Column(scale=1):
            text = gr.Textbox(label="Output", lines=14)
    submit = gr.Button("Merge LoRA adapter into base model")
    submit.click(
        fn=upload,
        inputs=[hf_token, base_model_name_or_path, peft_model_path, output_dir],
        outputs=text,
    )

demo.queue()
demo.launch(show_error=True)