Videobook_v1.5

Runtime error

App Files Community

Warlord-K committed on Jul 4, 2023

Commit

9f98d7f

•

1 Parent(s): 99b9405

Add in AV Combiner and Text Preprocessor

Browse files

Files changed (4) hide show

Videobook/AVCombiner.py +52 -0
Videobook/TextPreprocessor.py +61 -0
Videobook/Videobook.py +39 -44
requirements.txt +4 -1

Videobook/AVCombiner.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import shutil
+from gtts import gTTS
+from io import BytesIO
+from tempfile import NamedTemporaryFile
+from moviepy.editor import AudioFileClip, ImageClip, CompositeVideoClip, concatenate_videoclips
+class AVCombiner:
+  def __call__(self, images, texts, output_path):
+    """Create and save a video file to `output_path` after concatenating
+    static images and audio clips generated from the corresponding entries of `texts`"""
+    clips = []
+    for i,image in enumerate(images):
+      text = texts[i]
+      try:
+            # create the audio clip object from the text
+            audio_bytes = BytesIO()
+            gTTS(text=text).write_to_fp(audio_bytes)
+            audio_bytes.seek(0)
+            with NamedTemporaryFile(suffix=".mp3", delete=False) as audio_tempfile:
+                audio_tempfile.write(audio_bytes.read())
+            audio_clip = AudioFileClip(audio_tempfile.name)
+            # create the image clip object
+            image_clip = ImageClip(image)
+            # show the image for the audio's duration plus 2 seconds (the audio starts 1 second in)
+            image_clip = image_clip.set_duration(audio_clip.duration+2)
+            # create a composite clip with the image and audio clips
+            video_clip = CompositeVideoClip([image_clip.set_audio(audio_clip.set_start(1))])
+            # set the FPS to 1
+            video_clip.fps = 1
+            clips.append(video_clip)
+      except Exception as e:
+            print(e)
+    # concatenate all the video clips
+    final_clip = concatenate_videoclips(clips)
+    # Create a NamedTemporaryFile to store the video data
+    with NamedTemporaryFile(suffix=".mp4", delete=False) as video_tempfile:
+        # Write the video data to the temporary file
+        final_clip.write_videofile(video_tempfile.name)
+        # Copy the temporary file to the output path
+        shutil.copy(video_tempfile.name, output_path)
+    return f"{output_path}/{video_tempfile.name}"

Videobook/TextPreprocessor.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import nltk
+import spacy
+class TextPreprocessor:
+  """
+  Class that Preprocesses text for the pipeline
+  Resolves coreferences, then splits each sentence into positive and negative prompts (CLIP embedding generation is not implemented here).
+  """
+  def __init__(self):
+    nltk.download('punkt')
+    self.nlp = spacy.load("en_core_web_sm")
+    self.nlp.add_pipe(
+      "fastcoref",
+      config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
+    )
+  def coref(self, text = None):
+    '''
+        Does Coreference Resolution
+        Parameters:
+        text: the input paragraph whose coreference is to be resolved. Default: Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.
+        Returns:
+        Coreference Resolved paragraph
+    '''
+    if not text:
+      text = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'
+    doc = self.nlp(
+      text,
+      component_cfg={"fastcoref": {'resolve_text': True}}
+    )
+    # Check doc._.coref_clusters for cluster info
+    return doc._.resolved_text
+  def neg_prompt(self,string : str):
+        """
+        Splits text into a positive and a negative prompt.
+        """
+        positive = " "
+        negative = " "
+        words = nltk.word_tokenize(string)
+        for i, word in enumerate(words[:-1]):
+            if words[i+1].lower() not in ["n't", 'not']:
+                positive += " " + word
+            else:
+                for wor in words[i+2:]:
+                    negative += " " + wor
+                return {'pos':positive, 'neg': negative}
+        if(words!=[]):
+            positive+=words[-1]
+        return {'pos':positive, 'neg': negative}
+  def __call__(self, text):
+    coref_text = self.coref(text)
+    sentences = nltk.sent_tokenize(coref_text)
+    processed_sentences = []
+    for sentence in sentences:
+        processed_sentences.append(self.neg_prompt(sentence))
+    return processed_sentences, sentences

Videobook/Videobook.py CHANGED Viewed

@@ -1,61 +1,59 @@
 import os
 import cv2
 import numpy as np
-from gtts import gTTS
-from mutagen.mp3 import MP3
-import nltk
 import ffmpeg
-nltk.download('punkt')
-from nltk.tokenize import sent_tokenize
 from math import ceil
 from segmindapi import SD2_1, Kadinsky
 import gradio as gr
 class Videobook:
   def get_sentences(self, story):
-    return sent_tokenize(story)
-  def generate_voice(self, story, sentences , path = 'tts.mp3'):
-    for i,n in enumerate(sentences):
-        tts=gTTS(n,lang='en')
-        tts.save('tts'+str(i)+'.mp3')
-    lgth=[]
-    for i in range(len(sentences)):
-        lgth.append(MP3('tts'+str(i)+'.mp3').info.length)
-        os.remove(os.path.join(os.getcwd(),'tts'+str(i)+'.mp3'))
-    tts=gTTS(story,lang='en')
-    tts.save(path)
-    return lgth
   def generate_imgs(self, sentences, steps):
     imgs = []
     for sentence in sentences:
       sentence = self.style + ' of ' + sentence + ', ' + self.tags
-      imgs.append(self.pipe.generate(sentence, num_inference_steps = steps))
     return imgs
-  def addBuffer(self, imgs, lgth):
-    imgs_buff = []
-    for i,img in enumerate(imgs):
-      for j in range(ceil(lgth[i] * self.fps)):
-        imgs_buff.append(img)
-    return imgs_buff
-  def imgs_to_video(self, imgs, video_name='video.mp4'):
-    video_dims = (imgs[0].width, imgs[0].height)
-    fourcc = cv2.VideoWriter_fourcc(*'DIVX')
-    video = cv2.VideoWriter(video_name, fourcc, self.fps, video_dims)
-    for img in imgs:
-      tmp_img = img.copy()
-      video.write(cv2.cvtColor(np.array(tmp_img), cv2.COLOR_RGB2BGR))
-    video.release()
-  def make_video(self, imgs, lengths, video_name = "finished_video.mp4"):
-    self.imgs_to_video(self.addBuffer(imgs, lengths), 'test_video.mp4')
-    input_audio = ffmpeg.input(os.path.join(os.getcwd(),'tts.mp3'))
-    input_video = ffmpeg.input(os.path.join(os.getcwd(),'test_video.mp4'))
-    ffmpeg.concat(input_video, input_audio, v=1, a=1).output(video_name).run(overwrite_output=True)
   def generate(self, story, api_key, fps, style, tags, model, steps):
@@ -66,8 +64,5 @@ class Videobook:
       self.pipe = SD2_1(api_key)
     else:
       self.pipe = Kadinsky(api_key)
-    sentences = self.get_sentences(story)
-    lengths = self.generate_voice(story, sentences)
-    images = self.generate_imgs(sentences, steps)
-    self.make_video(images, lengths)
-    return "finished_video.mp4"

 import os
 import cv2
 import numpy as np
 import ffmpeg
 from math import ceil
 from segmindapi import SD2_1, Kadinsky
 import gradio as gr
+from .TextPreprocessor import TextPreprocessor
+from .AVCombiner import AVCombiner
 class Videobook:
+  def __init__(self):
+    self.preprocessor = TextPreprocessor()
+    self.combiner = AVCombiner()
   def get_sentences(self, story):
+    return self.preprocessor(story)
+  # def generate_voice(self, story, sentences , path = 'tts.mp3'):
+  #   for i,n in enumerate(sentences):
+  #       tts=gTTS(n,lang='en')
+  #       tts.save('tts'+str(i)+'.mp3')
+  #   lgth=[]
+  #   for i in range(len(sentences)):
+  #       lgth.append(MP3('tts'+str(i)+'.mp3').info.length)
+  #       os.remove(os.path.join(os.getcwd(),'tts'+str(i)+'.mp3'))
+  #   tts=gTTS(story,lang='en')
+  #   tts.save(path)
+  #   return lgth
   def generate_imgs(self, sentences, steps):
     imgs = []
     for sentence in sentences:
       sentence = self.style + ' of ' + sentence + ', ' + self.tags
+      imgs.append(self.pipe.generate(prompt = sentence['pos'], negative_prompt = sentence['neg'], num_inference_steps = steps))
     return imgs
+  # def addBuffer(self, imgs, lgth):
+  #   imgs_buff = []
+  #   for i,img in enumerate(imgs):
+  #     for j in range(ceil(lgth[i] * self.fps)):
+  #       imgs_buff.append(img)
+  #   return imgs_buff
+  # def imgs_to_video(self, imgs, video_name='video.mp4'):
+  #   video_dims = (imgs[0].width, imgs[0].height)
+  #   fourcc = cv2.VideoWriter_fourcc(*'DIVX')
+  #   video = cv2.VideoWriter(video_name, fourcc, self.fps, video_dims)
+  #   for img in imgs:
+  #     tmp_img = img.copy()
+  #     video.write(cv2.cvtColor(np.array(tmp_img), cv2.COLOR_RGB2BGR))
+  #   video.release()
+  # def make_video(self, imgs, lengths, video_name = "finished_video.mp4"):
+  #   self.imgs_to_video(self.addBuffer(imgs, lengths), 'test_video.mp4')
+  #   input_audio = ffmpeg.input(os.path.join(os.getcwd(),'tts.mp3'))
+  #   input_video = ffmpeg.input(os.path.join(os.getcwd(),'test_video.mp4'))
+  #   ffmpeg.concat(input_video, input_audio, v=1, a=1).output(video_name).run(overwrite_output=True)
   def generate(self, story, api_key, fps, style, tags, model, steps):
       self.pipe = SD2_1(api_key)
     else:
       self.pipe = Kadinsky(api_key)
+    processed_sentences, sentences = self.get_sentences(story)
+    return AVCombiner()(self.generate_imgs(processed_sentences, steps), sentences, os.getcwd())

requirements.txt CHANGED Viewed

@@ -5,4 +5,7 @@ nltk
 ffmpeg-python
 opencv-python
 numpy
-gradio

 ffmpeg-python
 opencv-python
 numpy
+gradio
+spacy
+fastcoref
+moviepy