Spaces:

Sanshruth
/

VQGAN_CLIP

Sleeping

App Files Files Community

VQGAN_CLIP / app.py

Sanshruth

Update app.py

3d30297 verified 3 months ago

raw

history blame

11.2 kB

	import zipfile
	def unzip_content():
	try:
	# First try using Python's zipfile
	print("Attempting to unzip content using Python...")
	with zipfile.ZipFile('./content.zip', 'r') as zip_ref:
	zip_ref.extractall('.')
	except Exception as e:
	print(f"Python unzip failed: {str(e)}")
	try:
	# Fallback to system unzip command
	print("Attempting to unzip content using system command...")
	subprocess.run(['unzip', '-o', './content.zip'], check=True)
	except Exception as e:
	print(f"System unzip failed: {str(e)}")
	raise Exception("Failed to unzip content using both methods")
	print("Content successfully unzipped!")

	# Try to unzip content at startup
	try:
	unzip_content()
	except Exception as e:
	print(f"Warning: Could not unzip content: {str(e)}")

	import gradio as gr
	import numpy as np
	import torch
	import torchvision
	import torchvision.transforms
	import torchvision.transforms.functional
	import PIL
	import matplotlib.pyplot as plt
	import yaml
	from omegaconf import OmegaConf
	from CLIP import clip
	import os
	import sys

	#os.chdir('./taming-transformers')
	#from taming.models.vqgan import VQModel
	#os.chdir('..')
	taming_path = os.path.join(os.getcwd(), 'taming-transformers')
	sys.path.append(taming_path)
	from taming.models.vqgan import VQModel


	from PIL import Image
	import cv2
	import imageio

	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	def create_video(image_folder='./generated', video_name='morphing_video.mp4'):
	images = sorted([img for img in os.listdir(image_folder) if img.endswith(".png") or img.endswith(".jpg")])
	if len(images) == 0:
	print("No images found in the folder.")
	return None

	frame = cv2.imread(os.path.join(image_folder, images[0]))
	height, width, layers = frame.shape
	video_writer = imageio.get_writer(video_name, fps=10)

	for image in images:
	img_path = os.path.join(image_folder, image)
	img = imageio.imread(img_path)
	video_writer.append_data(img)

	video_writer.close()
	return video_name

	def save_from_tensors(tensor, output_dir, filename):
	img = tensor.clone()
	img = img.mul(255).byte()
	img = img.cpu().numpy().transpose((1, 2, 0))
	os.makedirs(output_dir, exist_ok=True)
	Image.fromarray(img).save(os.path.join(output_dir, filename))

	def norm_data(data):
	return (data.clip(-1, 1) + 1) / 2

	def setup_clip_model():
	model, _ = clip.load('ViT-B/32', jit=False)
	model.eval().to(device)
	return model

	def setup_vqgan_model(config_path, checkpoint_path):
	config = OmegaConf.load(config_path)
	model = VQModel(**config.model.params)
	state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"]
	model.load_state_dict(state_dict, strict=False)
	return model.eval().to(device)

	def generator(x, model):
	x = model.post_quant_conv(x)
	x = model.decoder(x)
	return x

	def encode_text(text, clip_model):
	t = clip.tokenize(text).to(device)
	return clip_model.encode_text(t).detach().clone()

	def create_encoding(include, exclude, extras, clip_model):
	include_enc = [encode_text(text, clip_model) for text in include]
	exclude_enc = [encode_text(text, clip_model) for text in exclude]
	extras_enc = [encode_text(text, clip_model) for text in extras]
	return include_enc, exclude_enc, extras_enc

	def create_crops(img, num_crops=32, size1=225, noise_factor=0.05):
	aug_transform = torch.nn.Sequential(
	torchvision.transforms.RandomHorizontalFlip(),
	torchvision.transforms.RandomAffine(30, translate=(0.1, 0.1), fill=0)
	).to(device)

	p = size1 // 2
	img = torch.nn.functional.pad(img, (p, p, p, p), mode='constant', value=0)
	img = aug_transform(img)

	crop_set = []
	for _ in range(num_crops):
	gap1 = int(torch.normal(1.2, .3, ()).clip(.43, 1.9) * size1)
	offsetx = torch.randint(0, int(size1 * 2 - gap1), ())
	offsety = torch.randint(0, int(size1 * 2 - gap1), ())
	crop = img[:, :, offsetx:offsetx + gap1, offsety:offsety + gap1]
	crop = torch.nn.functional.interpolate(crop, (224, 224), mode='bilinear', align_corners=True)
	crop_set.append(crop)

	img_crops = torch.cat(crop_set, 0)
	randnormal = torch.randn_like(img_crops, requires_grad=False)
	randstotal = torch.rand((img_crops.shape[0], 1, 1, 1)).to(device)
	img_crops = img_crops + noise_factor * randstotal * randnormal

	return img_crops

	def optimize_result(params, prompt, vqgan_model, clip_model, w1, w2, extras_enc, exclude_enc):
	alpha = 1
	beta = 0.5
	out = generator(params, vqgan_model)
	out = norm_data(out)
	out = create_crops(out)
	out = torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
	(0.26862954, 0.26130258, 0.27577711))(out)

	img_enc = clip_model.encode_image(out)
	final_enc = w1 * prompt + w2 * extras_enc[0]
	final_text_include_enc = final_enc / final_enc.norm(dim=-1, keepdim=True)
	final_text_exclude_enc = exclude_enc[0]

	main_loss = torch.cosine_similarity(final_text_include_enc, img_enc, dim=-1)
	penalize_loss = torch.cosine_similarity(final_text_exclude_enc, img_enc, dim=-1)

	return -alpha * main_loss.mean() + beta * penalize_loss.mean()

	def optimize(params, optimizer, prompt, vqgan_model, clip_model, w1, w2, extras_enc, exclude_enc):
	loss = optimize_result(params, prompt, vqgan_model, clip_model, w1, w2, extras_enc, exclude_enc)
	optimizer.zero_grad()
	loss.backward()
	optimizer.step()
	return loss

	def training_loop(params, optimizer, include_enc, exclude_enc, extras_enc, vqgan_model, clip_model, w1, w2,
	total_iter=200, show_step=1):
	res_img = []
	res_z = []

	for prompt in include_enc:
	for it in range(total_iter):
	loss = optimize(params, optimizer, prompt, vqgan_model, clip_model, w1, w2, extras_enc, exclude_enc)

	if it >= 0 and it % show_step == 0:
	with torch.no_grad():
	generated = generator(params, vqgan_model)
	new_img = norm_data(generated[0].to(device))
	res_img.append(new_img)
	res_z.append(params.clone().detach())
	print(f"loss: {loss.item():.4f}\nno. of iteration: {it}")

	torch.cuda.empty_cache()
	return res_img, res_z

	def generate_art(include_text, exclude_text, extras_text, num_iterations):
	try:
	# Process the input prompts
	include = [x.strip() for x in include_text.split(',')]
	exclude = [x.strip() for x in exclude_text.split(',')]
	extras = [x.strip() for x in extras_text.split(',')]

	w1, w2 = 1.0, 0.9

	# Setup models
	clip_model = setup_clip_model()
	vqgan_model = setup_vqgan_model("./models/vqgan_imagenet_f16_16384/configs/model.yaml",
	"./models/vqgan_imagenet_f16_16384/checkpoints/last.ckpt")

	# Parameters
	learning_rate = 0.1
	batch_size = 1
	wd = 0.1
	size1, size2 = 225, 400

	# Initialize parameters
	initial_image = PIL.Image.open('./gradient1.png')
	initial_image = initial_image.resize((size2, size1))
	initial_image = torchvision.transforms.ToTensor()(initial_image).unsqueeze(0).to(device)

	with torch.no_grad():
	z, _, _ = vqgan_model.encode(initial_image)

	params = torch.nn.Parameter(z).to(device)
	optimizer = torch.optim.AdamW([params], lr=learning_rate, weight_decay=wd)
	params.data = params.data * 0.6 + torch.randn_like(params.data) * 0.4

	# Encode prompts
	include_enc, exclude_enc, extras_enc = create_encoding(include, exclude, extras, clip_model)

	# Run training loop
	res_img, res_z = training_loop(params, optimizer, include_enc, exclude_enc, extras_enc,
	vqgan_model, clip_model, w1, w2, total_iter=num_iterations)

	# Save results
	output_dir = "generated"
	# Create output directory if it doesn't exist
	os.makedirs(output_dir, exist_ok=True)

	# Clear any existing files in the output directory
	for file in os.listdir(output_dir):
	file_path = os.path.join(output_dir, file)
	if os.path.isfile(file_path):
	os.remove(file_path)

	for i, img in enumerate(res_img):
	save_from_tensors(img, output_dir, f"generated_image_{i:03d}.png")

	# Create video
	video_path = create_video()

	# Delete the generated folder and its contents after creating the video
	import shutil
	shutil.rmtree(output_dir)

	return video_path

	except Exception as e:
	# If there's an error, ensure the generated folder is cleaned up
	if os.path.exists("generated"):
	import shutil
	shutil.rmtree("generated")
	raise e # Re-raise the exception to be handled by the calling function
	def gradio_interface(include_text, exclude_text, extras_text, num_iterations):
	try:
	video_path = generate_art(include_text, exclude_text, extras_text, int(num_iterations))
	return video_path
	except Exception as e:
	return f"An error occurred: {str(e)}"

	# Define and launch the Gradio app
	iface = gr.Interface(
	fn=gradio_interface,
	inputs=[
	gr.Textbox(label="Include Prompts (comma-separated)",
	value="desert, heavy rain, cactus"),
	gr.Textbox(label="Exclude Prompts (comma-separated)",
	value="confusing, blurry"),
	gr.Textbox(label="Extra Style Prompts (comma-separated)",
	value="desert, clear, detailed, beautiful, good shape, detailed"),
	gr.Number(label="Number of Iterations",
	value=200, minimum=1, maximum=1000)
	],
	outputs=gr.Video(label="Generated Morphing Video"),
	title="VQGAN-CLIP Art Generator",
	css="allow",
	allow_flagging="never",
	#######
	description="""
	<a href="https://colab.research.google.com/drive/1ivRYvTaX90PRghQIqAdOyEawkY0YLefa?authuser=0#scrollTo=WE7aPQ0t1hd2">
	<img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
	</a>
	<a href="https://huggingface.co./spaces/your-username/your-space-name?duplicate=true">
	<img src="https://huggingface.co./datasets/huggingface/badges/raw/main/clone-space-lg.svg" alt="Clone Space"/>
	</a>
	<br><br>
	Generate artistic videos using VQGAN-CLIP.
	Enter your prompts separated by commas and adjust the number of iterations.
	The model will generate a morphing video based on your inputs.
	<br><br>
	<b>Note:</b> This application requires GPU access. Please either:
	<br>1. Use the Colab notebook (click the Colab badge above) with GPU runtime
	<br>2. Clone this space (click Clone Space badge) and enable GPU in your personal copy"""
	)

	if __name__ == "__main__":
	print("Checking GPU availability:", "GPU AVAILABLE" if torch.cuda.is_available() else "NO GPU FOUND")
	iface.launch()