Spaces:
Runtime error
Runtime error
Add in AV Combiner and Text Preprocessor
Browse files
- Videobook/AVCombiner.py +52 -0
- Videobook/TextPreprocessor.py +61 -0
- Videobook/Videobook.py +39 -44
- requirements.txt +4 -1
Videobook/AVCombiner.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import shutil
|
2 |
+
from gtts import gTTS
|
3 |
+
from io import BytesIO
|
4 |
+
from tempfile import NamedTemporaryFile
|
5 |
+
from moviepy.editor import AudioFileClip, ImageClip, CompositeVideoClip, concatenate_videoclips
|
6 |
+
|
7 |
+
class AVCombiner:
    """Combine still images and spoken captions into one narrated video."""

    # Seconds into each slide at which its narration starts.
    AUDIO_START = 1
    # Seconds each slide stays on screen beyond the length of its narration.
    EXTRA_DURATION = 2
    # Frames per second assigned to every slide clip.
    FPS = 1

    def __call__(self, images, texts, output_path):
        """Create a video from `images` narrated by `texts` and copy it to `output_path`.

        For every image, the text at the same index is converted to speech
        with gTTS. The image is shown for the length of that speech plus
        `EXTRA_DURATION` seconds, with the speech starting `AUDIO_START`
        seconds in. The slides are concatenated, rendered to a temporary
        ``.mp4`` file and copied to `output_path`.

        Parameters:
            images: sequence of images, each passed as-is to ``ImageClip``.
            texts: sequence of strings; ``texts[i]`` narrates ``images[i]``.
            output_path: destination handed to ``shutil.copy`` (a file path or
                an existing directory).

        Returns:
            The path of the copied video file, as returned by ``shutil.copy``.

        Raises:
            IndexError: if `texts` has fewer entries than `images`.
            ValueError: if not a single slide could be created.
        """
        import os  # local import: only needed for temp-file cleanup below

        clips = []
        audio_clips = []   # kept so their file readers can be closed afterwards
        temp_paths = []    # every temporary file created, removed in `finally`
        try:
            for i, image in enumerate(images):
                text = texts[i]
                try:
                    clips.append(
                        self._make_slide(image, text, audio_clips, temp_paths)
                    )
                except Exception as e:
                    # Best effort: a slide that cannot be built is reported
                    # and skipped so the remaining slides still end up in the
                    # video.
                    print(e)

            if not clips:
                raise ValueError(
                    "no video clips could be created from the given images and texts"
                )

            # concatenate all the video clips
            final_clip = concatenate_videoclips(clips)

            # Reserve a temporary file name, and close the handle before the
            # encoder writes to that path.
            with NamedTemporaryFile(suffix=".mp4", delete=False) as video_tempfile:
                video_path = video_tempfile.name
            temp_paths.append(video_path)
            final_clip.write_videofile(video_path)

            # shutil.copy returns the path of the file it created, which is
            # correct whether `output_path` is a file name or a directory.
            return shutil.copy(video_path, output_path)
        finally:
            for audio_clip in audio_clips:
                audio_clip.close()
            for path in temp_paths:
                if os.path.exists(path):
                    os.remove(path)

    def _make_slide(self, image, text, audio_clips, temp_paths):
        """Build one slide clip: `image` displayed while `text` is read aloud.

        The temporary audio file path and the opened audio clip are appended
        to `temp_paths` and `audio_clips` so the caller can release them.
        """
        # create the audio clip object from the text
        with NamedTemporaryFile(suffix=".mp3", delete=False) as audio_tempfile:
            # Register the path first so it is cleaned up even if the
            # text-to-speech request fails.
            temp_paths.append(audio_tempfile.name)
            gTTS(text=text).write_to_fp(audio_tempfile)
        audio_clip = AudioFileClip(audio_tempfile.name)
        audio_clips.append(audio_clip)

        # the image stays on screen for the narration plus the extra padding
        image_clip = ImageClip(image).set_duration(
            audio_clip.duration + self.EXTRA_DURATION
        )

        # create a composite clip with the image and the delayed narration
        video_clip = CompositeVideoClip(
            [image_clip.set_audio(audio_clip.set_start(self.AUDIO_START))]
        )
        video_clip.fps = self.FPS
        return video_clip
|
Videobook/TextPreprocessor.py
ADDED
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import nltk
|
2 |
+
import spacy
|
3 |
+
|
4 |
+
class TextPreprocessor:
    """
    Class that preprocesses text for the pipeline.

    Resolves coreferences in the text, splits it into sentences, and splits
    every sentence into a positive and a negative prompt.
    """

    # Lower-cased tokens that mark the start of the negated part of a sentence.
    _NEGATORS = ("n't", "not")

    def __init__(self):
        """Download the NLTK 'punkt' data and build the spaCy coreference pipeline."""
        nltk.download('punkt')
        self.nlp = spacy.load("en_core_web_sm")
        # NOTE(review): spaCy presumably only knows the "fastcoref" factory
        # after fastcoref's `spacy_component` module has been imported
        # somewhere; this file does not import it -- confirm.
        self.nlp.add_pipe(
            "fastcoref",
            config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'}
        )

    def coref(self, text=None):
        '''
        Does coreference resolution.

        Parameters:
            text: the input paragraph whose coreference is to be resolved.
                When empty or None, a built-in example paragraph is used:
                "Alice goes down the rabbit hole. Where she would discover a
                new reality beyond her expectations."

        Returns:
            The coreference-resolved paragraph (``doc._.resolved_text``).
        '''
        if not text:
            text = 'Alice goes down the rabbit hole. Where she would discover a new reality beyond her expectations.'
        doc = self.nlp(
            text,
            component_cfg={"fastcoref": {'resolve_text': True}}
        )
        # Check doc._.coref_clusters for cluster info
        return doc._.resolved_text

    def neg_prompt(self, string: str):
        """
        Splits text into a positive and a negative prompt.

        The string is tokenized with ``nltk.word_tokenize`` and split at the
        first "not" / "n't" token; see `_split_on_negation` for the rule.

        Returns:
            dict with the keys 'pos' and 'neg'.
        """
        return self._split_on_negation(nltk.word_tokenize(string))

    @classmethod
    def _split_on_negation(cls, words):
        """Split a token list at its first negator into {'pos': ..., 'neg': ...}.

        The token directly before the negator (e.g. the "does" of
        "does not") and the negator itself are dropped. Everything before
        them forms the positive prompt, everything after the negator forms
        the negative prompt. A negator in the very first position is not
        treated as a split point. Without a negator, all tokens form the
        positive prompt and the negative prompt is empty. Tokens are joined
        with single spaces.
        """
        for idx in range(1, len(words)):
            if words[idx].lower() in cls._NEGATORS:
                return {
                    'pos': " ".join(words[:idx - 1]),
                    'neg': " ".join(words[idx + 1:]),
                }
        return {'pos': " ".join(words), 'neg': ""}

    def __call__(self, text):
        """Preprocess `text`.

        Returns:
            A tuple ``(processed_sentences, sentences)``: the list of
            'pos'/'neg' prompt dicts, one per sentence, and the list of
            coreference-resolved sentences they were built from.
        """
        sentences = nltk.sent_tokenize(self.coref(text))
        processed_sentences = [self.neg_prompt(sentence) for sentence in sentences]
        return processed_sentences, sentences
|
Videobook/Videobook.py
CHANGED
@@ -1,61 +1,59 @@
|
|
1 |
import os
|
2 |
import cv2
|
3 |
import numpy as np
|
4 |
-
from gtts import gTTS
|
5 |
-
from mutagen.mp3 import MP3
|
6 |
-
import nltk
|
7 |
import ffmpeg
|
8 |
-
nltk.download('punkt')
|
9 |
-
from nltk.tokenize import sent_tokenize
|
10 |
from math import ceil
|
11 |
from segmindapi import SD2_1, Kadinsky
|
12 |
import gradio as gr
|
13 |
-
|
|
|
14 |
class Videobook:
|
15 |
-
|
|
|
|
|
16 |
def get_sentences(self, story):
|
17 |
-
return
|
18 |
|
19 |
-
def generate_voice(self, story, sentences , path = 'tts.mp3'):
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
30 |
|
31 |
def generate_imgs(self, sentences, steps):
|
32 |
imgs = []
|
33 |
for sentence in sentences:
|
34 |
sentence = self.style + ' of ' + sentence + ', ' + self.tags
|
35 |
-
imgs.append(self.pipe.generate(sentence, num_inference_steps = steps))
|
36 |
return imgs
|
37 |
|
38 |
-
def addBuffer(self, imgs, lgth):
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
|
45 |
-
def imgs_to_video(self, imgs, video_name='video.mp4'):
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
50 |
-
|
51 |
-
|
52 |
-
|
53 |
|
54 |
-
def make_video(self, imgs, lengths, video_name = "finished_video.mp4"):
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
|
60 |
|
61 |
def generate(self, story, api_key, fps, style, tags, model, steps):
|
@@ -66,8 +64,5 @@ class Videobook:
|
|
66 |
self.pipe = SD2_1(api_key)
|
67 |
else:
|
68 |
self.pipe = Kadinsky(api_key)
|
69 |
-
sentences = self.get_sentences(story)
|
70 |
-
|
71 |
-
images = self.generate_imgs(sentences, steps)
|
72 |
-
self.make_video(images, lengths)
|
73 |
-
return "finished_video.mp4"
|
|
|
1 |
import os
|
2 |
import cv2
|
3 |
import numpy as np
|
|
|
|
|
|
|
4 |
import ffmpeg
|
|
|
|
|
5 |
from math import ceil
|
6 |
from segmindapi import SD2_1, Kadinsky
|
7 |
import gradio as gr
|
8 |
+
from .TextPreprocessor import TextPreprocessor
|
9 |
+
from .AVCombiner import AVCombiner
|
10 |
class Videobook:
|
11 |
+
def __init__(self):
    """Create the text preprocessor and the audio/video combiner for this instance."""
    text_stage = TextPreprocessor()
    self.preprocessor = text_stage
    av_stage = AVCombiner()
    self.combiner = av_stage
|
14 |
def get_sentences(self, story):
    """Run this instance's preprocessor on `story` and return its result unchanged."""
    preprocess = self.preprocessor
    result = preprocess(story)
    return result
|
16 |
|
17 |
+
# def generate_voice(self, story, sentences , path = 'tts.mp3'):
|
18 |
+
# for i,n in enumerate(sentences):
|
19 |
+
# tts=gTTS(n,lang='en')
|
20 |
+
# tts.save('tts'+str(i)+'.mp3')
|
21 |
+
# lgth=[]
|
22 |
+
# for i in range(len(sentences)):
|
23 |
+
# lgth.append(MP3('tts'+str(i)+'.mp3').info.length)
|
24 |
+
# os.remove(os.path.join(os.getcwd(),'tts'+str(i)+'.mp3'))
|
25 |
+
# tts=gTTS(story,lang='en')
|
26 |
+
# tts.save(path)
|
27 |
+
# return lgth
|
28 |
|
29 |
def generate_imgs(self, sentences, steps):
    """Generate one image per preprocessed sentence.

    Parameters:
        sentences: iterable of dicts with the keys 'pos' and 'neg' (the
            positive and negative prompt of one sentence).
        steps: value passed to the pipe as `num_inference_steps`.

    Returns:
        List with the result of `self.pipe.generate` for every sentence, in
        input order.
    """
    imgs = []
    for sentence in sentences:
        # Only the positive part is wrapped with the style and tags; the
        # negative part is sent separately as the negative prompt.
        # Surrounding blanks are trimmed from both.
        prompt = self.style + ' of ' + sentence['pos'].strip() + ', ' + self.tags
        imgs.append(
            self.pipe.generate(
                prompt=prompt,
                negative_prompt=sentence['neg'].strip(),
                num_inference_steps=steps,
            )
        )
    return imgs
|
35 |
|
36 |
+
# def addBuffer(self, imgs, lgth):
|
37 |
+
# imgs_buff = []
|
38 |
+
# for i,img in enumerate(imgs):
|
39 |
+
# for j in range(ceil(lgth[i] * self.fps)):
|
40 |
+
# imgs_buff.append(img)
|
41 |
+
# return imgs_buff
|
42 |
|
43 |
+
# def imgs_to_video(self, imgs, video_name='video.mp4'):
|
44 |
+
# video_dims = (imgs[0].width, imgs[0].height)
|
45 |
+
# fourcc = cv2.VideoWriter_fourcc(*'DIVX')
|
46 |
+
# video = cv2.VideoWriter(video_name, fourcc, self.fps, video_dims)
|
47 |
+
# for img in imgs:
|
48 |
+
# tmp_img = img.copy()
|
49 |
+
# video.write(cv2.cvtColor(np.array(tmp_img), cv2.COLOR_RGB2BGR))
|
50 |
+
# video.release()
|
51 |
|
52 |
+
# def make_video(self, imgs, lengths, video_name = "finished_video.mp4"):
|
53 |
+
# self.imgs_to_video(self.addBuffer(imgs, lengths), 'test_video.mp4')
|
54 |
+
# input_audio = ffmpeg.input(os.path.join(os.getcwd(),'tts.mp3'))
|
55 |
+
# input_video = ffmpeg.input(os.path.join(os.getcwd(),'test_video.mp4'))
|
56 |
+
# ffmpeg.concat(input_video, input_audio, v=1, a=1).output(video_name).run(overwrite_output=True)
|
57 |
|
58 |
|
59 |
def generate(self, story, api_key, fps, style, tags, model, steps):
|
|
|
64 |
self.pipe = SD2_1(api_key)
|
65 |
else:
|
66 |
self.pipe = Kadinsky(api_key)
|
67 |
+
processed_sentences, sentences = self.get_sentences(story)
|
68 |
+
return AVCombiner()(self.generate_imgs(processed_sentences, steps), sentences, os.getcwd())
|
|
|
|
|
|
requirements.txt
CHANGED
@@ -5,4 +5,7 @@ nltk
|
|
5 |
ffmpeg-python
|
6 |
opencv-python
|
7 |
numpy
|
8 |
-
gradio
|
|
|
|
|
|
|
|
5 |
ffmpeg-python
|
6 |
opencv-python
|
7 |
numpy
|
8 |
+
gradio
|
9 |
+
spacy
|
10 |
+
fastcoref
|
11 |
+
moviepy
|