# SVDiff-Training-UI / trainer.py
# mkshing
# first commit
# e2a20af
# raw
# history blame
# 6.27 kB
from __future__ import annotations
import datetime
import os
import pathlib
import shlex
import shutil
import subprocess
import gradio as gr
import PIL.Image
import slugify
import torch
from huggingface_hub import HfApi
from accelerate.utils import write_basic_config
from app_upload import ModelUploader
from utils import save_model_card
# Invitation link that `Trainer.join_library_org` POSTs to (authenticated with
# the user's token) before a model is uploaded to the Hub.
# The hostname is the canonical `huggingface.co`, without a trailing dot.
URL_TO_JOIN_LIBRARY_ORG = 'https://huggingface.co/organizations/svdiff-library/share/PZBRRkosXikenXUdjMcvcoFmpWjcWnZjKL'
def pad_image(image: PIL.Image.Image) -> PIL.Image.Image:
    """Pad *image* with black so that it becomes square.

    The picture is centred along its shorter axis on a new canvas whose side
    equals the longer dimension. The canvas keeps the mode of the input.

    Args:
        image: The image to pad.

    Returns:
        ``image`` itself when it is already square, otherwise a new square
        image of the same mode.
    """
    w, h = image.size
    if w == h:
        return image

    side = max(w, h)
    # A colour *name* is resolved by Pillow for the target mode, so this also
    # works for single-band images (e.g. 'L'), where an explicit RGB tuple
    # such as (0, 0, 0) is rejected.
    new_image = PIL.Image.new(image.mode, (side, side), 'black')
    # One of the two offsets is always zero: only the shorter axis is padded.
    new_image.paste(image, ((side - w) // 2, (side - h) // 2))
    return new_image
class Trainer:
    """Prepare data, launch SVDiff training and optionally upload the result.

    Attributes are set in ``__init__``:
        hf_token: Hugging Face token (may be ``None``).
        api: ``HfApi`` client created with that token.
        model_uploader: ``ModelUploader`` created with that token.
    """

    def __init__(self, hf_token: str | None = None):
        self.hf_token = hf_token
        self.api = HfApi(token=hf_token)
        self.model_uploader = ModelUploader(hf_token)

    def prepare_dataset(self, instance_images: list, resolution: int,
                        instance_data_dir: pathlib.Path) -> None:
        """Write the uploaded images as square RGB JPEGs into a fresh folder.

        ``instance_data_dir`` is deleted (if present) and recreated, then each
        image is padded to a square, resized to ``resolution`` x ``resolution``
        and saved as ``000.jpg``, ``001.jpg``, ...

        Args:
            instance_images: Objects whose ``.name`` attribute is the path of
                an image file.
            resolution: Side length, in pixels, of the saved images.
            instance_data_dir: Destination directory.
        """
        shutil.rmtree(instance_data_dir, ignore_errors=True)
        instance_data_dir.mkdir(parents=True)
        for i, temp_path in enumerate(instance_images):
            # The context manager closes the underlying file as soon as the
            # pixels have been copied by ``convert``.
            with PIL.Image.open(temp_path.name) as source:
                # Normalise to RGB *before* padding: the padding canvas is
                # filled with an RGB colour, and pasting a palette image onto
                # a fresh canvas would not carry its palette over.
                image = source.convert('RGB')
            image = pad_image(image)
            image = image.resize((resolution, resolution))
            out_path = instance_data_dir / f'{i:03d}.jpg'
            image.save(out_path, format='JPEG', quality=100)

    def join_library_org(self) -> None:
        """POST to the organization invitation URL using ``self.hf_token``.

        Does nothing when no token is configured, since the request could not
        be authenticated. The outcome of the request is not checked.
        """
        if not self.hf_token:
            return
        # The arguments are passed as a list so the token is never re-parsed
        # as shell-like syntax.
        # NOTE(review): the token is still visible in the process list while
        # curl runs; consider an in-process HTTP client.
        subprocess.run([
            'curl',
            '-X',
            'POST',
            '-H',
            f'Authorization: Bearer {self.hf_token}',
            '-H',
            'Content-Type: application/json',
            URL_TO_JOIN_LIBRARY_ORG,
        ])

    @staticmethod
    def _build_command(
        *,
        base_model: str,
        instance_data_dir: pathlib.Path,
        output_dir: pathlib.Path,
        instance_prompt: str,
        validation_prompt: str,
        resolution: int,
        gradient_accumulation: int,
        learning_rate: float,
        n_steps: int,
        checkpointing_steps: int,
        validation_epochs: int,
        seed: int,
        fp16: bool,
        use_8bit_adam: bool,
        gradient_checkpointing: bool,
        use_wandb: bool,
    ) -> list[str]:
        """Return the ``accelerate launch train_svdiff.py`` argument vector.

        Every value becomes exactly one argument, so free-text values (the
        prompts in particular) cannot split into, or inject, extra options
        regardless of the quotes or spaces they contain.
        """
        argv = [
            'accelerate',
            'launch',
            'train_svdiff.py',
            f'--pretrained_model_name_or_path={base_model}',
            f'--instance_data_dir={instance_data_dir}',
            f'--output_dir={output_dir}',
            f'--instance_prompt={instance_prompt}',
            f'--resolution={resolution}',
            '--train_batch_size=1',
            f'--gradient_accumulation_steps={gradient_accumulation}',
            f'--learning_rate={learning_rate}',
            '--lr_scheduler=constant',
            '--lr_warmup_steps=0',
            f'--max_train_steps={n_steps}',
            f'--checkpointing_steps={checkpointing_steps}',
            f'--validation_prompt={validation_prompt}',
            f'--validation_epochs={validation_epochs}',
            f'--seed={seed}',
        ]
        if fp16:
            argv.append('--mixed_precision=fp16')
        if use_8bit_adam:
            argv.append('--use_8bit_adam')
        if gradient_checkpointing:
            argv.append('--gradient_checkpointing')
        if use_wandb:
            argv += ['--report_to', 'wandb']
        return argv

    def _release_gpu(self) -> None:
        """Request 'cpu-basic' hardware for the Space named by ``SPACE_ID``."""
        space_id = os.getenv('SPACE_ID')
        if space_id:
            self.api.request_space_hardware(repo_id=space_id,
                                            hardware='cpu-basic')

    def run(
        self,
        instance_images: list | None,
        instance_prompt: str,
        output_model_name: str,
        overwrite_existing_model: bool,
        validation_prompt: str,
        base_model: str,
        resolution_s: str,
        n_steps: int,
        learning_rate: float,
        gradient_accumulation: int,
        seed: int,
        fp16: bool,
        use_8bit_adam: bool,
        gradient_checkpointing: bool,
        # NOTE: enable_xformers_memory_efficient_attention is currently
        # disabled and therefore not part of the signature.
        checkpointing_steps: int,
        use_wandb: bool,
        validation_epochs: int,
        upload_to_hub: bool,
        use_private_repo: bool,
        delete_existing_repo: bool,
        upload_to: str,
        remove_gpu_after_training: bool,
    ) -> str:
        """Validate the inputs, train the model and optionally upload it.

        Args:
            instance_images: Uploaded files (objects with a ``.name`` path).
            instance_prompt: Prompt passed as ``--instance_prompt``.
            output_model_name: Name of the run; slugified and used as the
                folder name under ``experiments/`` and ``training_data/``.
                A timestamped default is used when it is empty (or slugifies
                to nothing).
            overwrite_existing_model: Delete an existing output folder first.
            validation_prompt: Prompt passed as ``--validation_prompt``.
            base_model: Value of ``--pretrained_model_name_or_path``.
            resolution_s: Training resolution as a string; parsed with ``int``.
            n_steps: Value of ``--max_train_steps``.
            learning_rate: Value of ``--learning_rate``.
            gradient_accumulation: Value of ``--gradient_accumulation_steps``.
            seed: Value of ``--seed``.
            fp16: Add ``--mixed_precision=fp16``.
            use_8bit_adam: Add ``--use_8bit_adam``.
            gradient_checkpointing: Add ``--gradient_checkpointing``.
            checkpointing_steps: Value of ``--checkpointing_steps``.
            use_wandb: Add ``--report_to wandb``.
            validation_epochs: Value of ``--validation_epochs``.
            upload_to_hub: Upload the output folder after training. Also
                implies that an existing output folder is replaced.
            use_private_repo: Forwarded to the uploader as ``private``.
            delete_existing_repo: Forwarded to the uploader.
            upload_to: Forwarded to the uploader.
            remove_gpu_after_training: Switch the Space (``SPACE_ID``) to
                'cpu-basic' hardware once training has finished or failed.

        Returns:
            A status message, including the uploader's message when uploading.

        Raises:
            gr.Error: If CUDA is unavailable, a required input is missing, the
                output folder already exists and may not be replaced, or the
                training process exits with a non-zero status.
        """
        if not torch.cuda.is_available():
            raise gr.Error('CUDA is not available.')
        # An empty upload is as unusable as no upload at all.
        if not instance_images:
            raise gr.Error('You need to upload images.')
        if not instance_prompt:
            raise gr.Error('The instance prompt is missing.')
        if not validation_prompt:
            raise gr.Error('The validation prompt is missing.')

        resolution = int(resolution_s)

        # Slugify first: a name made only of characters the slug drops would
        # otherwise become '' and make the folders below resolve to their
        # *parent* directories, which are then deleted.
        output_model_name = slugify.slugify(output_model_name or '')
        if not output_model_name:
            timestamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
            output_model_name = slugify.slugify(f'svdiff-pytorch-{timestamp}')

        repo_dir = pathlib.Path(__file__).parent
        output_dir = repo_dir / 'experiments' / output_model_name
        if overwrite_existing_model or upload_to_hub:
            shutil.rmtree(output_dir, ignore_errors=True)
        elif output_dir.exists():
            raise gr.Error(
                f'A model named "{output_model_name}" already exists. '
                'Choose another name or enable overwriting.')
        output_dir.mkdir(parents=True)

        instance_data_dir = repo_dir / 'training_data' / output_model_name
        self.prepare_dataset(instance_images, resolution, instance_data_dir)

        if upload_to_hub:
            self.join_library_org()

        # accelerate config
        write_basic_config()

        argv = self._build_command(
            base_model=base_model,
            instance_data_dir=instance_data_dir,
            output_dir=output_dir,
            instance_prompt=instance_prompt,
            validation_prompt=validation_prompt,
            resolution=resolution,
            gradient_accumulation=gradient_accumulation,
            learning_rate=learning_rate,
            n_steps=n_steps,
            checkpointing_steps=checkpointing_steps,
            validation_epochs=validation_epochs,
            seed=seed,
            fp16=fp16,
            use_8bit_adam=use_8bit_adam,
            gradient_checkpointing=gradient_checkpointing,
            use_wandb=use_wandb,
        )

        # Keep a copy of the command, quoted so it can be re-run from a shell.
        with open(output_dir / 'train.sh', 'w', encoding='utf-8') as f:
            f.write(' '.join(shlex.quote(arg) for arg in argv))

        result = subprocess.run(argv)
        if result.returncode != 0:
            # Still honour the request to release the GPU, then report the
            # failure instead of announcing (and uploading) a broken run.
            if remove_gpu_after_training:
                self._release_gpu()
            raise gr.Error(
                f'Training failed (exit code {result.returncode}).')

        save_model_card(save_dir=output_dir,
                        base_model=base_model,
                        instance_prompt=instance_prompt,
                        test_prompt=validation_prompt,
                        test_image_dir='test_images')

        message = 'Training completed!'
        print(message)

        if upload_to_hub:
            upload_message = self.model_uploader.upload_model(
                folder_path=output_dir.as_posix(),
                repo_name=output_model_name,
                upload_to=upload_to,
                private=use_private_repo,
                delete_existing_repo=delete_existing_repo)
            print(upload_message)
            message = message + '\n' + upload_message

        if remove_gpu_after_training:
            self._release_gpu()

        return message