import os
from math import ceil

import cv2
import ffmpeg
import gradio as gr
import nltk
import numpy as np
from gtts import gTTS
from mutagen.mp3 import MP3
from nltk.tokenize import sent_tokenize
from segmindapi import SD2_1, Kadinsky

# sent_tokenize requires the punkt tokenizer models.
nltk.download('punkt')


class Videobook:

    def get_sentences(self, story):
        # Split the story into sentences; each sentence gets its own image.
        return sent_tokenize(story)

    def generate_voice(self, story, sentences, path='tts.mp3'):
        # Synthesize each sentence separately only to measure its spoken
        # duration, so each image can be held on screen for that long.
        lengths = []
        for i, sentence in enumerate(sentences):
            tmp_path = 'tts' + str(i) + '.mp3'
            gTTS(sentence, lang='en').save(tmp_path)
            lengths.append(MP3(tmp_path).info.length)
            os.remove(tmp_path)
        # Synthesize the whole story once as the actual soundtrack.
        gTTS(story, lang='en').save(path)
        return lengths

    def generate_imgs(self, sentences, steps):
        # One image per sentence, prompted as "<style> of <sentence>, <tags>".
        imgs = []
        for sentence in sentences:
            prompt = self.style + ' of ' + sentence + ', ' + self.tags
            imgs.append(self.pipe.generate(prompt, num_inference_steps=steps))
        return imgs

    def addBuffer(self, imgs, lengths):
        # Repeat each frame so it stays on screen for its sentence's audio length.
        imgs_buff = []
        for img, length in zip(imgs, lengths):
            imgs_buff.extend([img] * ceil(length * self.fps))
        return imgs_buff

    def imgs_to_video(self, imgs, video_name='video.mp4'):
        video_dims = (imgs[0].width, imgs[0].height)
        # 'mp4v' matches the .mp4 container; the original 'DIVX' fourcc is an
        # AVI-era codec tag and can yield a broken or unplayable .mp4.
        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        video = cv2.VideoWriter(video_name, fourcc, self.fps, video_dims)
        for img in imgs:
            # PIL images are RGB; OpenCV expects BGR.
            video.write(cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR))
        video.release()

    def make_video(self, imgs, lengths, video_name='finished_video.mp4'):
        # Render the silent video first, then mux it with the narration track.
        self.imgs_to_video(self.addBuffer(imgs, lengths), 'test_video.mp4')
        input_audio = ffmpeg.input(os.path.join(os.getcwd(), 'tts.mp3'))
        input_video = ffmpeg.input(os.path.join(os.getcwd(), 'test_video.mp4'))
        ffmpeg.concat(input_video, input_audio, v=1, a=1) \
              .output(video_name).run(overwrite_output=True)

    def generate(self, story, api_key, fps, style, tags, model, steps):
        self.fps = fps
        self.style = style
        self.tags = tags
        if model == "Stable Diffusion v2.1":
            self.pipe = SD2_1(api_key)
        else:
            self.pipe = Kadinsky(api_key)
        sentences = self.get_sentences(story)
        lengths = self.generate_voice(story, sentences)
        images = self.generate_imgs(sentences, steps)
        self.make_video(images, lengths)
        return "finished_video.mp4"
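

# A minimal launch sketch: gradio is imported above but never wired up in this
# file, so the interface below is an assumption, not the original app's UI.
# Component labels, ranges, and default values are illustrative; the input
# order matches Videobook.generate(story, api_key, fps, style, tags, model, steps).
if __name__ == '__main__':
    app = Videobook()
    demo = gr.Interface(
        fn=app.generate,
        inputs=[
            gr.Textbox(label='Story', lines=7),
            gr.Textbox(label='Segmind API key'),
            gr.Slider(1, 30, value=10, step=1, label='FPS'),
            gr.Textbox(label='Style', value='Cartoon'),         # assumed default
            gr.Textbox(label='Tags', value='vivid, detailed'),  # assumed default
            gr.Dropdown(['Stable Diffusion v2.1', 'Kadinsky'], label='Model'),
            gr.Slider(10, 50, value=30, step=1, label='Inference steps'),
        ],
        outputs=gr.Video(label='Videobook'),
    )
    demo.launch()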