import os
import cv2
import numpy as np
import ffmpeg
from math import ceil
from segmindapi import SD2_1, Kadinsky
import gradio as gr
from .TextPreprocessor import TextPreprocessor
from .AVCombiner import AVCombiner


class Videobook:

    def __init__(self):
        self.preprocessor = TextPreprocessor()
        self.combiner = AVCombiner()

    def get_sentences(self, story):
        # Split the story into per-sentence prompt dicts plus the raw sentences.
        return self.preprocessor(story)

    # def generate_voice(self, story, sentences, path='tts.mp3'):
    #     for i, n in enumerate(sentences):
    #         tts = gTTS(n, lang='en')
    #         tts.save('tts' + str(i) + '.mp3')
    #     lgth = []
    #     for i in range(len(sentences)):
    #         lgth.append(MP3('tts' + str(i) + '.mp3').info.length)
    #         os.remove(os.path.join(os.getcwd(), 'tts' + str(i) + '.mp3'))
    #     tts = gTTS(story, lang='en')
    #     tts.save(path)
    #     return lgth

    def generate_imgs(self, sentences, steps):
        # Generate one image per sentence, prefixing the style and appending the
        # tags to the positive prompt before calling the selected Segmind pipeline.
        imgs = []
        for sentence in sentences:
            sentence['pos'] = self.style + ' of ' + sentence['pos'] + ', ' + self.tags
            imgs.append(self.pipe.generate(prompt=sentence['pos'],
                                           negative_prompt=sentence['neg'],
                                           num_inference_steps=steps))
        return imgs

    # def addBuffer(self, imgs, lgth):
    #     imgs_buff = []
    #     for i, img in enumerate(imgs):
    #         for j in range(ceil(lgth[i] * self.fps)):
    #             imgs_buff.append(img)
    #     return imgs_buff

    # def imgs_to_video(self, imgs, video_name='video.mp4'):
    #     video_dims = (imgs[0].width, imgs[0].height)
    #     fourcc = cv2.VideoWriter_fourcc(*'DIVX')
    #     video = cv2.VideoWriter(video_name, fourcc, self.fps, video_dims)
    #     for img in imgs:
    #         tmp_img = img.copy()
    #         video.write(cv2.cvtColor(np.array(tmp_img), cv2.COLOR_RGB2BGR))
    #     video.release()

    # def make_video(self, imgs, lengths, video_name="finished_video.mp4"):
    #     self.imgs_to_video(self.addBuffer(imgs, lengths), 'test_video.mp4')
    #     input_audio = ffmpeg.input(os.path.join(os.getcwd(), 'tts.mp3'))
    #     input_video = ffmpeg.input(os.path.join(os.getcwd(), 'test_video.mp4'))
    #     ffmpeg.concat(input_video, input_audio, v=1, a=1).output(video_name).run(overwrite_output=True)

    def generate(self, story, api_key, fps, style, tags, model, steps):
        # Build the full videobook: pick the image pipeline, generate one image per
        # processed sentence, then hand everything to AVCombiner to produce the video.
        self.fps = fps
        self.style = style
        self.tags = tags
        if model == "Stable Diffusion v2.1":
            self.pipe = SD2_1(api_key)
        else:
            self.pipe = Kadinsky(api_key)
        processed_sentences, sentences = self.get_sentences(story)
        return self.combiner(self.generate_imgs(processed_sentences, steps), sentences, os.getcwd())
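

# Usage sketch, assuming a valid Segmind API key and that TextPreprocessor /
# AVCombiner return the structures expected above; the story, key, style, and
# tags below are hypothetical placeholders, not values from this repository:
#
#   book = Videobook()
#   video_path = book.generate(
#       story="Once upon a time, a fox crossed the river...",
#       api_key="YOUR_SEGMIND_API_KEY",
#       fps=1,
#       style="watercolor illustration",
#       tags="storybook, soft lighting, detailed",
#       model="Stable Diffusion v2.1",
#       steps=30,
#   )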