Warlord-K committed
Commit 9f98d7f
1 Parent(s): 99b9405

Add in AV Combiner and Text Preprocessor

Videobook/AVCombiner.py ADDED
@@ -0,0 +1,52 @@
+ import shutil
+ from gtts import gTTS
+ from io import BytesIO
+ from tempfile import NamedTemporaryFile
+ from moviepy.editor import AudioFileClip, ImageClip, CompositeVideoClip, concatenate_videoclips
+
+ class AVCombiner:
+
+     def __call__(self, images, texts, output_path):
+         """Create and save a video file to `output_path` after concatenating
+         static images and audio clips generated from the corresponding entries in `texts`."""
+
+         clips = []
+         for i, image in enumerate(images):
+             text = texts[i]
+
+             try:
+                 # create the audio clip from the text
+                 audio_bytes = BytesIO()
+                 gTTS(text=text).write_to_fp(audio_bytes)
+                 audio_bytes.seek(0)
+                 with NamedTemporaryFile(suffix=".mp3", delete=False) as audio_tempfile:
+                     audio_tempfile.write(audio_bytes.read())
+                 audio_clip = AudioFileClip(audio_tempfile.name)
+
+                 # create the image clip
+                 image_clip = ImageClip(image)
+
+                 # hold the image for the narration length plus 2 seconds of padding
+                 image_clip = image_clip.set_duration(audio_clip.duration + 2)
+
+                 # create a composite clip with the image and audio clips
+                 video_clip = CompositeVideoClip([image_clip.set_audio(audio_clip.set_start(1))])
+
+                 # set the FPS to 1
+                 video_clip.fps = 1
+
+                 clips.append(video_clip)
+             except Exception as e:
+                 print(e)
+
+         # concatenate all the video clips
+         final_clip = concatenate_videoclips(clips)
+
+         # Create a NamedTemporaryFile to store the video data
+         with NamedTemporaryFile(suffix=".mp4", delete=False) as video_tempfile:
+             # Write the video data to the temporary file
+             final_clip.write_videofile(video_tempfile.name)
+
+         # Copy the temporary file to the output path and return the saved location
+         saved_path = shutil.copy(video_tempfile.name, output_path)
+         return saved_path
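
For context, a minimal sketch of how the new AVCombiner might be driven; the file names and captions below are illustrative, not from the repository:

from Videobook.AVCombiner import AVCombiner

# Hypothetical inputs: moviepy's ImageClip accepts image file paths or numpy arrays.
images = ["page1.png", "page2.png"]
texts = ["Alice falls down the rabbit hole.",
         "She lands in a strange new world."]

# Narrates each caption with gTTS, pairs it with its image, concatenates the
# resulting clips, and copies the final .mp4 into the given output directory.
saved_path = AVCombiner()(images, texts, ".")
print(saved_path)

Each image is held for the length of its narration plus two seconds, with the audio starting one second in, so the per-page pacing comes from the audio rather than from an fps setting.
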
Videobook/TextPreprocessor.py ADDED
@@ -0,0 +1,61 @@
+ import nltk
+ import spacy
+
+ class TextPreprocessor:
+     """
+     Preprocesses text for the pipeline:
+     resolves coreferences, then splits each sentence into a positive and a negative prompt.
+     """
+     def __init__(self):
+
+         nltk.download('punkt')
+         self.nlp = spacy.load("en_core_web_sm")
+         self.nlp.add_pipe(
+             "fastcoref",
+             config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
+         )
+
+     def coref(self, text=None):
+         '''
+         Performs coreference resolution.
+         Parameters:
+             text: the input paragraph whose coreferences are to be resolved. Default: Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.
+
+         Returns:
+             The coreference-resolved paragraph.
+
+         '''
+         if not text:
+             text = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'
+         doc = self.nlp(
+             text,
+             component_cfg={"fastcoref": {'resolve_text': True}}
+         )
+         # Check doc._.coref_clusters for cluster info
+         return doc._.resolved_text
+
+     def neg_prompt(self, string: str):
+         """
+         Splits text into a positive and a negative prompt.
+         """
+         positive = " "
+         negative = " "
+         words = nltk.word_tokenize(string)
+         for i, word in enumerate(words[:-1]):
+             if words[i+1].lower() not in ["n't", 'not']:
+                 positive += " " + word
+             else:
+                 for wor in words[i+2:]:
+                     negative += " " + wor
+                 return {'pos': positive, 'neg': negative}
+         if words:
+             positive += words[-1]
+         return {'pos': positive, 'neg': negative}
+
+     def __call__(self, text):
+         coref_text = self.coref(text)
+         sentences = nltk.sent_tokenize(coref_text)
+         processed_sentences = []
+         for sentence in sentences:
+             processed_sentences.append(self.neg_prompt(sentence))
+         return processed_sentences, sentences
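
A short sketch of what the preprocessor returns; constructing it downloads the punkt tokenizer and loads en_core_web_sm plus the fastcoref pipe, and the sample text is made up:

from Videobook.TextPreprocessor import TextPreprocessor

preprocessor = TextPreprocessor()

# Resolves coreferences, sentence-tokenizes the result, and returns one
# {'pos': ..., 'neg': ...} prompt pair per sentence alongside the sentences.
processed, sentences = preprocessor("Tom built a boat. It was not painted.")

# The prompt split itself is deterministic and can be checked directly:
preprocessor.neg_prompt("The cat is not friendly")
# -> {'pos': '  The cat', 'neg': '  friendly'}

Note that the token immediately before the negation ("is" above) ends up in neither prompt, and everything after the negation goes to the negative prompt.
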
Videobook/Videobook.py CHANGED
@@ -1,61 +1,59 @@
  import os
  import cv2
  import numpy as np
- from gtts import gTTS
- from mutagen.mp3 import MP3
- import nltk
  import ffmpeg
- nltk.download('punkt')
- from nltk.tokenize import sent_tokenize
  from math import ceil
  from segmindapi import SD2_1, Kadinsky
  import gradio as gr
-
+ from .TextPreprocessor import TextPreprocessor
+ from .AVCombiner import AVCombiner
  class Videobook:
-
+     def __init__(self):
+         self.preprocessor = TextPreprocessor()
+         self.combiner = AVCombiner()
      def get_sentences(self, story):
-         return sent_tokenize(story)
+         return self.preprocessor(story)

-     def generate_voice(self, story, sentences , path = 'tts.mp3'):
-         for i,n in enumerate(sentences):
-             tts=gTTS(n,lang='en')
-             tts.save('tts'+str(i)+'.mp3')
-         lgth=[]
-         for i in range(len(sentences)):
-             lgth.append(MP3('tts'+str(i)+'.mp3').info.length)
-             os.remove(os.path.join(os.getcwd(),'tts'+str(i)+'.mp3'))
-         tts=gTTS(story,lang='en')
-         tts.save(path)
-         return lgth
+     # def generate_voice(self, story, sentences , path = 'tts.mp3'):
+     #     for i,n in enumerate(sentences):
+     #         tts=gTTS(n,lang='en')
+     #         tts.save('tts'+str(i)+'.mp3')
+     #     lgth=[]
+     #     for i in range(len(sentences)):
+     #         lgth.append(MP3('tts'+str(i)+'.mp3').info.length)
+     #         os.remove(os.path.join(os.getcwd(),'tts'+str(i)+'.mp3'))
+     #     tts=gTTS(story,lang='en')
+     #     tts.save(path)
+     #     return lgth

      def generate_imgs(self, sentences, steps):
          imgs = []
          for sentence in sentences:
-             sentence = self.style + ' of ' + sentence + ', ' + self.tags
-             imgs.append(self.pipe.generate(sentence, num_inference_steps = steps))
+             prompt = self.style + ' of ' + sentence['pos'] + ', ' + self.tags
+             imgs.append(self.pipe.generate(prompt = prompt, negative_prompt = sentence['neg'], num_inference_steps = steps))
          return imgs

-     def addBuffer(self, imgs, lgth):
-         imgs_buff = []
-         for i,img in enumerate(imgs):
-             for j in range(ceil(lgth[i] * self.fps)):
-                 imgs_buff.append(img)
-         return imgs_buff
+     # def addBuffer(self, imgs, lgth):
+     #     imgs_buff = []
+     #     for i,img in enumerate(imgs):
+     #         for j in range(ceil(lgth[i] * self.fps)):
+     #             imgs_buff.append(img)
+     #     return imgs_buff

-     def imgs_to_video(self, imgs, video_name='video.mp4'):
-         video_dims = (imgs[0].width, imgs[0].height)
-         fourcc = cv2.VideoWriter_fourcc(*'DIVX')
-         video = cv2.VideoWriter(video_name, fourcc, self.fps, video_dims)
-         for img in imgs:
-             tmp_img = img.copy()
-             video.write(cv2.cvtColor(np.array(tmp_img), cv2.COLOR_RGB2BGR))
-         video.release()
+     # def imgs_to_video(self, imgs, video_name='video.mp4'):
+     #     video_dims = (imgs[0].width, imgs[0].height)
+     #     fourcc = cv2.VideoWriter_fourcc(*'DIVX')
+     #     video = cv2.VideoWriter(video_name, fourcc, self.fps, video_dims)
+     #     for img in imgs:
+     #         tmp_img = img.copy()
+     #         video.write(cv2.cvtColor(np.array(tmp_img), cv2.COLOR_RGB2BGR))
+     #     video.release()

-     def make_video(self, imgs, lengths, video_name = "finished_video.mp4"):
-         self.imgs_to_video(self.addBuffer(imgs, lengths), 'test_video.mp4')
-         input_audio = ffmpeg.input(os.path.join(os.getcwd(),'tts.mp3'))
-         input_video = ffmpeg.input(os.path.join(os.getcwd(),'test_video.mp4'))
-         ffmpeg.concat(input_video, input_audio, v=1, a=1).output(video_name).run(overwrite_output=True)
+     # def make_video(self, imgs, lengths, video_name = "finished_video.mp4"):
+     #     self.imgs_to_video(self.addBuffer(imgs, lengths), 'test_video.mp4')
+     #     input_audio = ffmpeg.input(os.path.join(os.getcwd(),'tts.mp3'))
+     #     input_video = ffmpeg.input(os.path.join(os.getcwd(),'test_video.mp4'))
+     #     ffmpeg.concat(input_video, input_audio, v=1, a=1).output(video_name).run(overwrite_output=True)


      def generate(self, story, api_key, fps, style, tags, model, steps):
@@ -66,8 +64,5 @@ class Videobook:
              self.pipe = SD2_1(api_key)
          else:
              self.pipe = Kadinsky(api_key)
-         sentences = self.get_sentences(story)
-         lengths = self.generate_voice(story, sentences)
-         images = self.generate_imgs(sentences, steps)
-         self.make_video(images, lengths)
-         return "finished_video.mp4"
+         processed_sentences, sentences = self.get_sentences(story)
+         return self.combiner(self.generate_imgs(processed_sentences, steps), sentences, os.getcwd())
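
Taken together, the change swaps the old gTTS/mutagen/OpenCV/ffmpeg path for TextPreprocessor and AVCombiner. A rough sketch of a call under the new flow; the api_key, style, tags, model, and steps values are placeholders, and the exact model string the if-branch checks is not shown in this hunk:

from Videobook.Videobook import Videobook

vb = Videobook()
video_path = vb.generate(
    story="Alice goes down the rabbit hole. She discovers a new reality.",
    api_key="YOUR_SEGMIND_API_KEY",   # placeholder
    fps=1,                            # placeholder
    style="watercolor painting",      # placeholder
    tags="storybook, soft colors",    # placeholder
    model="Stable Diffusion 2.1",     # placeholder; selects SD2_1 vs Kadinsky
    steps=30,                         # placeholder
)
# video_path points at the .mp4 that AVCombiner saves into os.getcwd().
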
requirements.txt CHANGED
@@ -5,4 +5,7 @@ nltk
  ffmpeg-python
  opencv-python
  numpy
- gradio
+ gradio
+ spacy
+ fastcoref
+ moviepy
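
To pick up the new dependencies after this change, reinstalling with pip install -r requirements.txt should suffice; note that TextPreprocessor also loads the en_core_web_sm spaCy model by name, which is fetched separately with python -m spacy download en_core_web_sm.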