Spaces: Running on Zero
[email protected]
commited on
Commit
·
eae1cca
1
Parent(s):
48798aa
update
Files changed:
- app.py +4 -4
- assets/coords/sample1.npz +2 -2
- assets/coords/sample2.npz +2 -2
- assets/coords/sample3.npz +2 -2
- assets/coords/sample4.npz +2 -2
- assets/coords/sample5.npz +3 -0
- assets/videos/sample5.mp4 +0 -0
- attributtes_utils.py +13 -11
- fete_model.py +13 -13
- inference_util.py +5 -1
- preprocess_videos.py +7 -3
app.py
CHANGED
@@ -35,10 +35,9 @@ available_audios = natsorted(glob.glob("./assets/audios/*.wav"))
 available_audios = [os.path.basename(x) for x in available_audios]
 
 
-
 with gr.Blocks() as demo:
     gr.HTML(
-
+        """
         <h1 style="text-align: center; font-size: 40px; font-family: 'Times New Roman', Times, serif;">
         Free-View Expressive Talking Head Video Editing
         </h1>
@@ -51,7 +50,8 @@
         <img src="https://huggingface.co/datasets/huggingface/badges/raw/main/duplicate-this-space-sm.svg#center" alt="Duplicate Space">
         </a>
         </p>
-        """
+        """
+    )
     with gr.Column(elem_id="col-container"):
         with gr.Row():
             with gr.Column():
@@ -80,4 +80,4 @@
     audio_input.select(lambda x: "./assets/audios/" + x, audio_input, audio_preview_output)
     submit_btn.click(process, inputs, outputs)
 
-demo.queue(max_size=
+demo.queue(max_size=10).launch()
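For context, the app.py change only rebalances the gr.HTML call (the opening and closing triple quotes and the closing parenthesis each get their own line) and fills in the queue size. A minimal runnable sketch of the resulting pattern, using only the Gradio calls that appear in the diff (the heading markup is shortened here):

import gradio as gr

with gr.Blocks() as demo:
    gr.HTML(
        """
        <h1 style="text-align: center;">Free-View Expressive Talking Head Video Editing</h1>
        """
    )
    # ... the rest of the UI (columns, rows, inputs) is unchanged by this commit ...

demo.queue(max_size=10).launch()  # queue at most 10 pending requests, then start the app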
assets/coords/sample1.npz
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:05569fbc982413520a7f81636cba156e6f67344c1cd13f4831ccd95cbb1bf0ad
+size 454
assets/coords/sample2.npz
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:37790e74ae602e20aa3be2811f60cda21905cc1a88d6efc53f99ec9a73f7e1df
+size 810
assets/coords/sample3.npz
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:20b7d61ba04d8743c5d0a26b235897df9664b5dc095d96a17be3b9cdbfb06142
+size 528
assets/coords/sample4.npz
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:79b016ed5c7934c7e667dc6999457726175b17d4ef0fb67a755c5d038f0b75ec
+size 567
assets/coords/sample5.npz
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5db52100e8db15ec88b4b3e4d0229fcbdcf96f623b6f4eb8890f7915b430b914
+size 655
assets/videos/sample5.mp4
ADDED
Binary file (439 kB).
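The *.npz entries above are Git LFS pointer files (spec version, object hash, size), not the coordinate arrays themselves. Once the LFS objects are pulled, each archive is read the way load_from_npz in preprocess_videos.py below reads it; a minimal sketch, assuming each archive stores a "coords" array written by preprocess():

import numpy as np

# sample5.npz is one of the pointer files touched by this commit; "coords" is the
# key that load_from_npz in preprocess_videos.py reads.
coords = np.load("./assets/coords/sample5.npz")["coords"]
print(coords.shape)  # one crop box per processed frame (assumption for illustration)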
attributtes_utils.py
CHANGED
@@ -5,43 +5,45 @@ import sys
 def input_pose(pose_select="front"):
     step = 1
     if pose_select == "front":
-        pose = [[0.0, 0.0, 0.0] for i in range(0, 10, step)]
+        pose = [[0.0, 0.0, 0.0] for i in range(0, 10, step)]  # -20 to 20
     elif pose_select == "left_right_shaking":
-        pose = [[-i, 0.0, 0.0] for i in range(0, 20, step)]#0 to -20
+        pose = [[-i, 0.0, 0.0] for i in range(0, 20, step)]  # 0 to -20
         pose += [[i - 20.0, 0.0, 0.0] for i in range(0, 40, step)]  # -20 to 20
         pose += [[20.0 - i, 0.0, 0.0] for i in range(0, 20, step)]  # 20 to 0
         pose = pose + pose
         pose = pose + pose
         pose = pose + pose
-        # pose = pose + pose[::-1]
     else:
         raise ValueError("pose_select Error")
 
     return pose
 
 
-EMOTIONS = [
+EMOTIONS = ["angry", "disgust", "fear", "happy", "sad", "surprise", "neutral"]
+
+
 def input_emotion(emotion_select="neutral"):
     sacle_factor = 2
     if emotion_select == "neutral":
-        emotion = [[0.0,0.0,0.0,0.0,0.0,0.0,1.0] for _ in range(2)]#((i%50))*0.04
+        emotion = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0] for _ in range(2)]  # ((i%50))*0.04
     elif emotion_select == "happy":
-        emotion = [[0.0,0.0,0.0,1.0,0.0,0.0,0.0] for _ in range(2)]#((i%50))*0.04
+        emotion = [[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0] for _ in range(2)]  # ((i%50))*0.04
     elif emotion_select == "angry":
-        emotion = [[1.0,0.0,0.0,0.0,0.0,0.0,0.0] for _ in range(2)]
+        emotion = [[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0] for _ in range(2)]
     elif emotion_select == "surprised":
-        emotion = [[0.0,0.0,0.0,0.0,0.0,1.0,0.0] for _ in range(2)]
+        emotion = [[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0] for _ in range(2)]
     else:
         raise ValueError("emotion_select Error")
 
-    return emotion * sacle_factor
+    return emotion * sacle_factor
 
 
 def input_blink(blink_select="yes"):
     if blink_select == "yes":
-        blink = [[1.0]
+        blink = [[1.0] for _ in range(25)]
+        blink += [[0.8], [0.6], [0.0], [0.0]]
+        blink += [[1.0] for _ in range(5)]
         blink = blink + blink + blink
     else:
         blink = [[1.0] for _ in range(2)]
     return blink
-
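Two things stand out in this file: the new EMOTIONS list documents the one-hot order used by input_emotion, and input_blink now builds an explicit open-close-open blink schedule. A small standalone check built only from the values that appear in the diff:

EMOTIONS = ["angry", "disgust", "fear", "happy", "sad", "surprise", "neutral"]
one_hot_happy = [0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]   # the "happy" vector from input_emotion
print(EMOTIONS[one_hot_happy.index(1.0)])             # -> happy: the hot index matches the EMOTIONS order

blink = [[1.0] for _ in range(25)]      # eyes open for 25 frames
blink += [[0.8], [0.6], [0.0], [0.0]]   # 4-frame closing ramp
blink += [[1.0] for _ in range(5)]      # reopen for 5 frames
blink = blink + blink + blink           # three blink cycles
print(len(blink))                       # -> 102 frames in total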
fete_model.py
CHANGED
@@ -206,11 +206,11 @@ class FETE_model(nn.Module):
         # face_sequences = torch.cat([face_sequences[:, :, i] for i in range(face_sequences.size(2))], dim=0)
         # print(audio_sequences.size(), face_sequences.size(), pose_sequences.size(), emotion_sequences.size())
 
-        audio_embedding
-        pose_embedding
+        audio_embedding = self.audio_encoder(audio_sequences)  # B, 512, 1, 1
+        pose_embedding = self.pose_encoder(pose_sequences)  # B, 512, 1, 1
         emotion_embedding = self.emotion_encoder(emotion_sequences)  # B, 512, 1, 1
-        blink_embedding
-        inputs_embedding
+        blink_embedding = self.blink_encoder(blink_sequences)  # B, 512, 1, 1
+        inputs_embedding = torch.cat((audio_embedding, pose_embedding, emotion_embedding, blink_embedding), dim=1)  # B, 1536, 1, 1
         # print(audio_embedding.size(), pose_embedding.size(), emotion_embedding.size(), inputs_embedding.size())
 
         feats = []
@@ -261,10 +261,10 @@ class Self_Attention(nn.Module):
         """
         super(Self_Attention, self).__init__()
         self.query_conv = nn.Conv2d(in_channels=in_planes_s, out_channels=in_planes_s // 8, kernel_size=1)
-        self.key_conv
+        self.key_conv = nn.Conv2d(in_channels=in_planes_r, out_channels=in_planes_r // 8, kernel_size=1)
         self.value_conv = nn.Conv2d(in_channels=in_planes_r, out_channels=in_planes_r, kernel_size=1)
-        self.gamma
-        self.softmax
+        self.gamma = nn.Parameter(torch.zeros(1))
+        self.softmax = nn.Softmax(dim=-1)
 
     def forward(self, source):
         source = source.float() if isinstance(source, torch.cuda.HalfTensor) else source
@@ -286,11 +286,11 @@ class Self_Attention(nn.Module):
         r_batchsize, rC, rH, rW = reference.size()
 
         proj_query = self.query_conv(source).view(s_batchsize, -1, sH * sW).permute(0, 2, 1)
-        proj_key
-        energy
-        attention
+        proj_key = self.key_conv(reference).view(r_batchsize, -1, rW * rH)
+        energy = torch.bmm(proj_query, proj_key)
+        attention = self.softmax(energy)
         proj_value = self.value_conv(reference).view(r_batchsize, -1, rH * rW)
-        out
-        out
-        out
+        out = torch.bmm(proj_value, attention.permute(0, 2, 1))
+        out = out.view(s_batchsize, sC, sH, sW)
+        out = self.gamma * out + source
         return out.half() if isinstance(source, torch.cuda.FloatTensor) else out
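The fete_model.py hunks only restore truncated right-hand sides; the computation is the usual query/key/value attention with a learned residual gate. A self-contained sketch of that arithmetic (the TinySelfAttention name, the two-argument forward and the 64-channel toy sizes are illustrative, not the repository's exact class):

import torch
import torch.nn as nn

class TinySelfAttention(nn.Module):
    # Toy restatement of the Self_Attention arithmetic restored above.
    def __init__(self, in_planes_s, in_planes_r):
        super().__init__()
        self.query_conv = nn.Conv2d(in_planes_s, in_planes_s // 8, kernel_size=1)
        self.key_conv = nn.Conv2d(in_planes_r, in_planes_r // 8, kernel_size=1)
        self.value_conv = nn.Conv2d(in_planes_r, in_planes_r, kernel_size=1)
        self.gamma = nn.Parameter(torch.zeros(1))  # starts at 0, so the module is an identity at init
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, source, reference):
        b, c, h, w = source.size()
        rb, rc, rh, rw = reference.size()
        q = self.query_conv(source).view(b, -1, h * w).permute(0, 2, 1)   # B, HW, C//8
        k = self.key_conv(reference).view(rb, -1, rh * rw)                # B, C//8, HW
        attention = self.softmax(torch.bmm(q, k))                         # B, HW, HW
        v = self.value_conv(reference).view(rb, -1, rh * rw)              # B, C, HW
        out = torch.bmm(v, attention.permute(0, 2, 1)).view(b, c, h, w)
        return self.gamma * out + source                                  # gated residual, as in the diff

x = torch.randn(2, 64, 8, 8)
print(TinySelfAttention(64, 64)(x, x).shape)  # torch.Size([2, 64, 8, 8])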
inference_util.py
CHANGED
@@ -280,7 +280,11 @@ def infenrece(model, face_path, audio_path, pose, emotion, blink, preview=False)
     else:
         outfile = "/tmp/{}.mp4".format(timestamp)
         tmp_video = "/tmp/temp_{}.mp4".format(timestamp)
-    writer =
+    writer = (
+        imageio.get_writer(tmp_video, fps=fps, codec="libx264", quality=10, pixelformat="yuv420p", macro_block_size=1)
+        if not preview
+        else None
+    )
     # print('Generating frames...', outfile, steps)
     for inputs, frames, coords in tqdm(gen, total=steps):
         with torch.no_grad():
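The only change here is that the truncated writer assignment becomes a full conditional expression: a video encoder is opened only when a real render is requested, and preview mode gets None. A minimal sketch of the same pattern, assuming imageio with its ffmpeg plugin is installed (the preview, fps and tmp_video values are placeholders):

import imageio
import numpy as np

preview = False                      # placeholder; the real value comes from the caller
fps = 25                             # placeholder frame rate
tmp_video = "/tmp/temp_example.mp4"  # placeholder output path

writer = (
    imageio.get_writer(tmp_video, fps=fps, codec="libx264", quality=10,
                       pixelformat="yuv420p", macro_block_size=1)
    if not preview
    else None
)

if writer is not None:
    writer.append_data(np.zeros((64, 64, 3), dtype=np.uint8))  # one dummy frame
    writer.close()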
preprocess_videos.py
CHANGED
@@ -9,6 +9,7 @@ from natsort import natsorted
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
+
 def get_squre_coords(coords, image, size=None, last_size=None):
     y1, y2, x1, x2 = coords
     w, h = x2 - x1, y2 - y1
@@ -63,7 +64,7 @@ def face_detect(images, pads):
     x1 = max(0, rect[0] - padx1)
     x2 = min(image.shape[1], rect[2] + padx2)
     # y_gap, x_gap = ((y2 - y1) * 2) // 3, ((x2 - x1) * 2) // 3
-    y_gap, x_gap = (y2 - y1)//2, (x2 - x1)//2
+    y_gap, x_gap = (y2 - y1) // 2, (x2 - x1) // 2
     coords_ = [y1 - y_gap, y2 + y_gap, x1 - x_gap, x2 + x_gap]
 
     _, coords = get_squre_coords(coords_, image)
@@ -79,18 +80,20 @@ def face_detect(images, pads):
     print("Number of frames cropped: {}".format(len(results)))
     print("First coords: {}".format(results[0]))
     boxes = np.array(results)
-    boxes = get_smoothened_boxes(boxes, T=
+    boxes = get_smoothened_boxes(boxes, T=25)
     # results = [[image[y1:y2, x1:x2], (y1, y2, x1, x2)] for image, (x1, y1, x2, y2) in zip(images, boxes)]
 
     del detector
     return boxes
 
+
 def add_black(imgs):
     for i in range(len(imgs)):
         imgs[i] = cv2.vconcat([np.zeros((100, imgs[i].shape[1], 3), dtype=np.uint8), imgs[i], np.zeros((20, imgs[i].shape[1], 3), dtype=np.uint8)])
 
     return imgs
 
+
 def preprocess(video_dir="./assets/videos", save_dir="./assets/coords"):
     all_videos = natsorted(glob.glob(os.path.join(video_dir, "*.mp4")))
     for video_path in all_videos:
@@ -115,5 +118,6 @@ def load_from_npz(video_name, save_dir="./assets/coords"):
     npz = np.load(os.path.join(save_dir, video_name + ".npz"))
     return npz["coords"]
 
+
 if __name__ == "__main__":
-    preprocess()
+    preprocess()
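get_smoothened_boxes itself is not touched by this commit, only the window it is called with (T=25, roughly one second at 25 fps). For intuition, a rough sketch of what smoothing face boxes over a T-frame window typically looks like; this is an assumption for illustration, not the repository's implementation:

import numpy as np

def smooth_boxes(boxes, T=25):
    # Moving average of per-frame boxes over a centred window of up to T frames.
    # Hypothetical stand-in for get_smoothened_boxes, shown only for intuition.
    boxes = np.asarray(boxes, dtype=float)
    smoothed = boxes.copy()
    for i in range(len(boxes)):
        window = boxes[max(0, i - T // 2): i + T // 2 + 1]
        smoothed[i] = window.mean(axis=0)
    return smoothed

jittery = np.array([[10, 12], [30, 8], [11, 13], [29, 9]], dtype=float)
print(smooth_boxes(jittery, T=3))  # neighbouring frames pull each box toward the local mean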