"""Gradio app that turns a long story into a short slideshow video.

Pipeline: the story is summarized with DistilBART, every summary sentence is
sent to a remote latent-diffusion Space to obtain images, and the images are
stitched together into an MP4 with moviepy.
"""

import base64
import io
import os
import re
import sys
import tempfile

from PIL import Image
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import requests
import gradio as gr
import torch
from moviepy.editor import *  # provides ImageClip and concatenate_videoclips
from huggingface_hub import snapshot_download
import cv2

# Remote text-to-image Space, wrapped as a callable by Gradio.
image_gen = gr.Interface.load("spaces/multimodalart/latentdiffusion")

# NOTE(review): `description` and `title` are not referenced anywhere in this
# file and mention ViT/GPT-2 rather than the summarization pipeline below;
# kept unchanged in case something outside this file reads them.
description = "Just upload an image, and generate a short story for the image.\n PS: GPT-2 is not perfect but it's fun to play with.May take a minute for the output to generate. Enjoyy!!!"
title = "Story generator from images using ViT and GPT2"

# Summarization model used to condense the story before image generation.
tokenizer = AutoTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = AutoModelForSeq2SeqLM.from_pretrained("sshleifer/distilbart-cnn-12-6")


def _summarize_to_sentences(text):
    """Summarize *text* and return the non-empty sentences of the summary."""
    inputs = tokenizer(text, max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = model.generate(inputs["input_ids"])
    summary = tokenizer.batch_decode(
        summary_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    # Split on periods and drop blank fragments. This keeps a final sentence
    # that has no trailing period and never sends an empty prompt downstream.
    fragments = (piece.strip() for piece in summary[0].split("."))
    return [piece for piece in fragments if piece]


def _decode_data_url(data_url):
    """Decode a base64 image string (optionally a ``data:`` URL) to a PIL image."""
    # Strip a "data:<mime>;base64," header if there is one; base64 payloads
    # never contain a comma, so splitting on the first comma is safe.
    header, separator, payload = data_url.partition(",")
    if not separator:
        payload = header
    decoded_bytes = base64.decodebytes(payload.encode("utf-8"))
    return Image.open(io.BytesIO(decoded_bytes))


def _generate_images(sentence):
    """Ask the remote diffusion Space for images illustrating *sentence*."""
    steps = 50
    width = 256
    height = 256
    num_images = 3
    diversity = 6
    image_bytes = image_gen(sentence, steps, width, height, num_images, diversity)
    # Algo from spaces/Gradio-Blocks/latent_gpt2_story/blob/main/app.py
    # The second element of the response is iterated as a gallery whose
    # entries carry the encoded image at index 0.
    return [_decode_data_url(image[0]) for image in image_bytes[1]]


def get_output_video(text):
    """Create a slideshow video illustrating a summary of *text*.

    Args:
        text: The story to summarize and illustrate.

    Returns:
        The path of the written video file (``"test.mp4"``).

    Raises:
        ValueError: If *text* is blank or no images could be generated.
    """
    if not text or not text.strip():
        raise ValueError("Please provide a non-empty story.")

    generated_images = []
    for sentence in _summarize_to_sentences(text):
        generated_images.extend(_generate_images(sentence))

    if not generated_images:
        raise ValueError("No images were generated for the given story.")

    # Frames live in a per-call temporary directory so overlapping requests
    # cannot overwrite each other's files and nothing is left behind.
    with tempfile.TemporaryDirectory() as frame_dir:
        file_names = []
        for index, img in enumerate(generated_images):
            f_name = os.path.join(frame_dir, f"img_{index}.jpg")
            # JPEG cannot store alpha or palette modes; normalize to RGB.
            img.convert("RGB").save(f_name)
            file_names.append(f_name)

        clips = [ImageClip(m).set_duration(2) for m in file_names]
        concat_clip = concatenate_videoclips(clips, method="compose")
        # Write the video before the temporary frame directory is removed.
        concat_clip.write_videofile("test.mp4", fps=24)

    return "test.mp4"


# Sample story pre-filled in the input box.
text = (
    "Once, there was a boy who became bored when he watched over the village sheep grazing on the hillside. \n"
    "To entertain himself, he sang out, “Wolf! Wolf! The wolf is chasing the sheep!”."
    "When the villagers heard the cry, they came running up the hill to drive the wolf away. "
    "But, when they arrived, they saw no wolf. The boy was amused when seeing their angry faces."
    "Don’t scream wolf, boy,” warned the villagers, “when there is no wolf!” They angrily went back down the hill."
    "Later, the shepherd boy cried out once again, “Wolf! Wolf! The wolf is chasing the sheep!” "
    "To his amusement, he looked on as the villagers came running up the hill to scare the wolf away."
    "As they saw there was no wolf, they said strictly, “Save your frightened cry for when there really is a wolf! "
    "Don’t cry ‘wolf’ when there is no wolf!” "
    "But the boy grinned at their words while they walked grumbling down the hill once more."
    "Later, the boy saw a real wolf sneaking around his flock. "
    "Alarmed, he jumped on his feet and cried out as loud as he could, “Wolf! Wolf!” "
    "But the villagers thought he was fooling them again, and so they didn’t come to help."
    "At sunset, the villagers went looking for the boy who hadn’t returned with their sheep. "
    "When they went up the hill, they found him weeping."
    "“There really was a wolf here! The flock is gone! I cried out, ‘Wolf!’ but you didn’t come,” he wailed."
    "An old man went to comfort the boy. "
    "As he put his arm around him, he said, “Nobody believes a liar, even when he is telling the truth!\""
)

demo = gr.Blocks()

with demo:
    gr.Markdown("# A System pipeline to generate bite-sized video from long stories")
    gr.Markdown(
        "A story can be input by user. The story is summarized using DistillBART model. \n"
        "Then, it's step by step sent to the multimodal Diffusion model, to generate images.These are depicted as a video."
    )
    with gr.Row():
        # Left column (inputs)
        with gr.Column():
            input_start_text = gr.Textbox(
                value=text,
                label="Type your story here, for now a sample story is added already!",
            )
            with gr.Row():
                button_gen_video = gr.Button("Generate Video")

        # Right column (outputs)
        with gr.Column():
            output_interpolation = gr.Video(label="Generated Video")

    gr.Markdown("\n\nFuture Works and Challenges\n\n")
    gr.Markdown("Though this pipeline, isn't 100% perfect, but one can use similar system to create bite-sized videos from text resources. Effective for creating videos for educational lessons.")

    # Event listeners must be registered inside the Blocks context.
    button_gen_video.click(fn=get_output_video, inputs=input_start_text, outputs=output_interpolation)

demo.launch(debug=False)