import os

import cv2              # used only by the commented-out legacy video code below
import numpy as np      # used only by the commented-out legacy video code below
import ffmpeg           # used only by the commented-out legacy video code below
from math import ceil   # used only by the commented-out legacy video code below

from segmindapi import SD2_1, Kadinsky
import gradio as gr     # imported but currently unused in this module

from .TextPreprocessor import TextPreprocessor
from .AVCombiner import AVCombiner


class Videobook:
  """Turns a story into a video: one generated image per sentence, assembled with audio by AVCombiner."""

  def __init__(self):
    self.preprocessor = TextPreprocessor()
    self.combiner = AVCombiner()

  def get_sentences(self, story):
    return self.preprocessor(story)
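
  # Inferred from usage below (not documented in the original): the
  # preprocessor returns (processed_sentences, sentences), where each
  # processed sentence is a dict with 'pos' and 'neg' prompt strings.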

  # Legacy gTTS narration, superseded by AVCombiner. If revived, it also needs:
  #   from gtts import gTTS
  #   from mutagen.mp3 import MP3
  # def generate_voice(self, story, sentences, path='tts.mp3'):
  #   for i, n in enumerate(sentences):
  #       tts = gTTS(n, lang='en')
  #       tts.save('tts' + str(i) + '.mp3')
  #   lgth = []
  #   for i in range(len(sentences)):
  #       lgth.append(MP3('tts' + str(i) + '.mp3').info.length)
  #       os.remove(os.path.join(os.getcwd(), 'tts' + str(i) + '.mp3'))
  #   tts = gTTS(story, lang='en')
  #   tts.save(path)
  #   return lgth

  def generate_imgs(self, sentences, steps):
    imgs = []
    for sentence in sentences:
      # Wrap the positive prompt with the requested style and tags.
      # (The original concatenated the dict itself, which raises a TypeError.)
      prompt = self.style + ' of ' + sentence['pos'] + ', ' + self.tags
      imgs.append(self.pipe.generate(prompt = prompt,
                                     negative_prompt = sentence['neg'],
                                     num_inference_steps = steps))
    return imgs

  # Legacy frame-buffering and OpenCV/ffmpeg assembly, also superseded by AVCombiner.
  # def addBuffer(self, imgs, lgth):
  #   imgs_buff = []
  #   for i, img in enumerate(imgs):
  #     # Hold each image on screen for its sentence's audio duration.
  #     for j in range(ceil(lgth[i] * self.fps)):
  #       imgs_buff.append(img)
  #   return imgs_buff

  # def imgs_to_video(self, imgs, video_name='video.mp4'):
  #   video_dims = (imgs[0].width, imgs[0].height)
  #   fourcc = cv2.VideoWriter_fourcc(*'DIVX')
  #   video = cv2.VideoWriter(video_name, fourcc, self.fps, video_dims)
  #   for img in imgs:
  #     tmp_img = img.copy()
  #     video.write(cv2.cvtColor(np.array(tmp_img), cv2.COLOR_RGB2BGR))
  #   video.release()

  # def make_video(self, imgs, lengths, video_name="finished_video.mp4"):
  #   self.imgs_to_video(self.addBuffer(imgs, lengths), 'test_video.mp4')
  #   input_audio = ffmpeg.input(os.path.join(os.getcwd(), 'tts.mp3'))
  #   input_video = ffmpeg.input(os.path.join(os.getcwd(), 'test_video.mp4'))
  #   ffmpeg.concat(input_video, input_audio, v=1, a=1).output(video_name).run(overwrite_output=True)

  def generate(self, story, api_key, fps, style, tags, model, steps):
    self.fps = fps
    self.style = style
    self.tags = tags
    # Select the Segmind pipeline; any model other than SD 2.1 falls back to Kadinsky.
    if model == "Stable Diffusion v2.1":
      self.pipe = SD2_1(api_key)
    else:
      self.pipe = Kadinsky(api_key)
    processed_sentences, sentences = self.get_sentences(story)
    # Reuse the combiner constructed in __init__ instead of building a new one.
    return self.combiner(self.generate_imgs(processed_sentences, steps), sentences, os.getcwd())
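
# Minimal usage sketch (not in the original file), assuming a valid Segmind API
# key and that this module is imported as part of its package (it uses relative
# imports, so it cannot be run directly as a script). All argument values below
# are illustrative placeholders.
#
#   book = Videobook()
#   out = book.generate(
#       story="A fox finds a lantern in the winter woods.",
#       api_key="YOUR_SEGMIND_API_KEY",          # placeholder credential
#       fps=12,
#       style="watercolor illustration",
#       tags="storybook, soft lighting",
#       model="Stable Diffusion v2.1",           # any other value selects Kadinsky
#       steps=30,
#   )
#   print(out)  # whatever AVCombiner returns, e.g. a path to the finished video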