Spaces:
Runtime error
Runtime error
Upload 5 files
Browse files- animation.gif +0 -0
- modelutil.py +34 -0
- streamlitapp.py +60 -0
- test_video.mp4 +0 -0
- utils.py +52 -0
animation.gif
ADDED
modelutil.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tensorflow.python.ops.numpy_ops import np_config
|
2 |
+
# Switch on TensorFlow's NumPy-compatibility behaviour for tensors.
# NOTE(review): presumably needed for the NumPy-style ``.astype`` calls made
# on tensors elsewhere in the app — confirm before removing.
np_config.enable_numpy_behavior()
|
3 |
+
import os
|
4 |
+
from tensorflow.keras.models import Sequential
|
5 |
+
from tensorflow.keras.layers import Conv3D, LSTM, Dense, Dropout, Bidirectional, MaxPool3D, Activation, Reshape, SpatialDropout3D, BatchNormalization, TimeDistributed, Flatten
|
6 |
+
|
7 |
+
def load_model(weights_path: str = os.path.join('..', 'models', 'checkpoint')) -> Sequential:
    """Build the lip-reading network and restore its trained weights.

    The network is three Conv3D -> ReLU -> MaxPool3D stages, a per-time-step
    flatten, two bidirectional LSTM layers (each followed by dropout) and a
    softmax layer with 41 outputs per time step.

    Args:
        weights_path: Checkpoint location passed to ``model.load_weights``.
            The default is a relative path, so it is resolved against the
            current working directory rather than against this file; pass
            an explicit path when the app is started from somewhere else.

    Returns:
        The ``Sequential`` model with the checkpoint weights loaded.
    """
    model = Sequential()

    # Input is a clip of 75 frames, each 46x140 with a single channel.
    model.add(Conv3D(128, 3, input_shape=(75,46,140,1), padding='same'))
    model.add(Activation('relu'))
    # Pool size 1 on the first axis keeps all 75 steps; only the two
    # spatial axes are halved.
    model.add(MaxPool3D((1,2,2)))

    model.add(Conv3D(256, 3, padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPool3D((1,2,2)))

    model.add(Conv3D(75, 3, padding='same'))
    model.add(Activation('relu'))
    model.add(MaxPool3D((1,2,2)))

    # Flatten each time step independently so the LSTMs receive a sequence.
    model.add(TimeDistributed(Flatten()))

    model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
    model.add(Dropout(.5))

    model.add(Bidirectional(LSTM(128, kernel_initializer='Orthogonal', return_sequences=True)))
    model.add(Dropout(.5))

    model.add(Dense(41, kernel_initializer='he_normal', activation='softmax'))
    model.load_weights(weights_path)

    return model
|
streamlitapp.py
ADDED
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Import all of the dependencies
import streamlit as st
import os
import subprocess
import imageio
import numpy as np

import tensorflow as tf
from utils import load_data, num_to_char
from modelutil import load_model


# Set the layout to the streamlit app as wide
st.set_page_config(layout='wide')

# Setup the sidebar
with st.sidebar:
    st.image('https://plus.unsplash.com/premium_photo-1682309676673-392c56015c5c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=1000&q=80')
    st.title('Lip Reading')
    st.info('This application is originally developed from the LipNet deep learning model.')

st.title('LipNet using StreamLit βπ»')
# Generating a list of options or videos
options = os.listdir(os.path.join('..', 'data', 's1'))
selected_video = st.selectbox('Choose video', options)

# Generate two columns
col1, col2 = st.columns(2)

if options:

    # Rendering the video
    with col1:
        st.info('The video below displays the converted video in mp4 format')
        file_path = os.path.join('..','data','s1', selected_video)

        # Convert to mp4 for display. The arguments are passed as a list
        # (no shell) so file names containing spaces or shell
        # metacharacters cannot break or alter the command.
        try:
            conversion = subprocess.run(
                ['ffmpeg', '-y', '-i', file_path, '-vcodec', 'libx264', 'test_video.mp4'],
                check=False,
            )
            converted = conversion.returncode == 0
        except OSError:
            # ffmpeg is not installed or could not be started.
            converted = False
        if not converted:
            # Surface the failure instead of silently showing an old file.
            st.warning('ffmpeg could not convert the selected video; the preview below may be out of date.')

        # Rendering inside of the app; the context manager closes the file
        # handle once its bytes have been read.
        with open('test_video.mp4', 'rb') as video_file:
            video_bytes = video_file.read()
        st.video(video_bytes)


    with col2:
        st.info('π This is all the machine learning model sees when making a prediction')
        video, annotations,image_data = load_data(tf.convert_to_tensor(file_path))
        # Scale the normalised frames up so they are visible when written
        # out as an 8-bit GIF.
        imageio.mimsave('animation.gif',np.squeeze((video * 50).astype(np.uint8)) , duration=100)
        st.image('animation.gif', width=400)

        st.info('This is the output of the machine learning model as tokens')
        model = load_model()
        # The model expects a leading batch axis.
        yhat = model.predict(tf.expand_dims(video, axis=0))
        # Greedy CTC decoding over a sequence length of 75 steps.
        decoder = tf.keras.backend.ctc_decode(yhat, [75], greedy=True)[0][0].numpy()
        st.text(decoder)

        # Convert prediction to text
        st.info('Decode the raw tokens into words')
        converted_prediction = tf.strings.reduce_join(num_to_char(decoder)).numpy().decode('utf-8')
        st.text(converted_prediction)
|
60 |
+
|
test_video.mp4
ADDED
Binary file (110 kB). View file
|
|
utils.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import tensorflow as tf
|
2 |
+
from typing import List
|
3 |
+
import numpy as np
|
4 |
+
import cv2
|
5 |
+
import os
|
6 |
+
|
7 |
+
# One vocabulary entry per character of this string.
vocab = list("abcdefghijklmnopqrstuvwxyz'?!123456789 ")
# Mapping characters to integer ids
char_to_num = tf.keras.layers.StringLookup(vocabulary=vocab, oov_token="")
# Mapping integer ids back to the original characters, reusing the exact
# vocabulary of the forward lookup so both directions agree.
num_to_char = tf.keras.layers.StringLookup(
    invert=True,
    oov_token="",
    vocabulary=char_to_num.get_vocabulary(),
)
|
13 |
+
|
14 |
+
def load_video(path:str) -> tf.Tensor:
    """Read a video file and return its cropped, standardised frames.

    Each frame is converted to a single channel with
    ``tf.image.rgb_to_grayscale`` and cropped to rows 190-235 and columns
    80-219 (a 46x140 window). The stacked frames are then shifted by their
    mean and divided by their standard deviation.

    Args:
        path: Location of the video file, passed to ``cv2.VideoCapture``.

    Returns:
        A float32 tensor holding one cropped, standardised frame per
        successfully decoded video frame.
    """
    cap = cv2.VideoCapture(path)
    frames = []
    try:
        for _ in range(int(cap.get(cv2.CAP_PROP_FRAME_COUNT))):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is actually
                # decodable; stop rather than hand a missing frame to
                # TensorFlow.
                break
            frame = tf.image.rgb_to_grayscale(frame)
            frames.append(frame[190:236,80:220,:])
    finally:
        # Release the capture even if decoding or conversion raised.
        cap.release()

    # NOTE(review): the mean is taken on the frames' original dtype and is
    # subtracted before the cast to float32. This is intentionally left
    # unchanged because the shipped checkpoint was presumably trained with
    # exactly this preprocessing — confirm before "fixing" the arithmetic.
    mean = tf.math.reduce_mean(frames)
    std = tf.math.reduce_std(tf.cast(frames, tf.float32))
    return tf.cast((frames - mean), tf.float32) / std
|
27 |
+
|
28 |
+
def load_alignments(path:str) -> tf.Tensor:
    """Read an alignment file and encode its words as character ids.

    Every line is split on whitespace and its third field is taken as the
    word. Lines whose word is ``'sil'`` are ignored. The remaining words are
    joined with single spaces, split into characters and mapped through
    ``char_to_num``.

    Args:
        path: Location of the ``.align`` text file.

    Returns:
        A tensor of character ids for the space-separated words.
    """
    tokens = []
    with open(path, 'r') as f:
        for line in f:
            fields = line.split()
            # Blank or truncated lines have no word column; skip them
            # instead of failing with an IndexError.
            if len(fields) < 3:
                continue
            if fields[2] != 'sil':
                # A space goes before every word; extend in place rather
                # than rebuilding the whole list for each token.
                tokens.extend((' ', fields[2]))
    # [1:] drops the id of the separator that precedes the first word.
    return char_to_num(tf.reshape(tf.strings.unicode_split(tokens, input_encoding='UTF-8'), (-1)))[1:]
|
38 |
+
|
39 |
+
def load_data(path: str):
    """Load the frames, alignment ids and displayable images for one clip.

    Only the base file name (text before the first ``.``) of ``path`` is
    used; the video is read from ``../data/s1/<name>.mpg`` and the
    alignment from ``../data/alignments/s1/<name>.align``.

    Args:
        path: Path of the selected clip. May be a string tensor (anything
            exposing ``.numpy()``), ``bytes`` or ``str``, using either
            ``/`` or ``\\`` as separator.

    Returns:
        A tuple ``(frames, alignments, image_data)`` where ``frames`` is the
        output of ``load_video``, ``alignments`` is the output of
        ``load_alignments`` and ``image_data`` is ``frames`` scaled by 255,
        cast to uint8 and squeezed.
    """
    # Callers hand in a string tensor; unwrap it, but tolerate plain
    # bytes/str as well.
    if hasattr(path, 'numpy'):
        path = path.numpy()
    if isinstance(path, bytes):
        path = bytes.decode(path)

    # Normalise Windows separators first so one split works on every
    # platform. Previously the POSIX result was overwritten by a
    # backslash-only split, which left an empty name for '/'-separated
    # paths such as '../data/s1/clip.mpg'.
    file_name = path.replace('\\', '/').split('/')[-1].split('.')[0]
    video_path = os.path.join('..','data','s1',f'{file_name}.mpg')
    alignment_path = os.path.join('..','data','alignments','s1',f'{file_name}.align')
    frames = load_video(video_path)
    print(frames.shape)
    alignments = load_alignments(alignment_path)
    # NOTE(review): ``.astype`` on a tensor presumably relies on TensorFlow's
    # NumPy behaviour having been enabled by another module — confirm.
    image_data = (frames * 255).astype(np.uint8)
    image_data = np.squeeze(image_data)

    return frames, alignments, image_data
|