shisheng7 committed
Commit • f7e8357
Parent(s): 2a393cc
initial update
Browse files:
- README.md +3 -4
- app.py +88 -0
- requirements.txt +30 -0
README.md
CHANGED
@@ -1,8 +1,8 @@
 ---
 title: JoyHallo
-emoji:
-colorFrom:
-colorTo:
+emoji: π
+colorFrom: gray
+colorTo: purple
 sdk: gradio
 sdk_version: 4.44.0
 app_file: app.py
@@ -10,4 +10,3 @@ pinned: false
 license: mit
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
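For reference, the Space front matter after this commit reads as follows (pinned: false is the unchanged context line from the second hunk header):

---
title: JoyHallo
emoji: π
colorFrom: gray
colorTo: purple
sdk: gradio
sdk_version: 4.44.0
app_file: app.py
pinned: false
license: mit
---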
app.py
ADDED
@@ -0,0 +1,88 @@
import os

import gradio as gr
from huggingface_hub import snapshot_download

# Work relative to this file so relative model paths resolve.
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Fetch the pretrained weights: the Hallo base models, the JoyHallo
# checkpoint, and the Chinese wav2vec2 audio encoder.
hallo_dir = snapshot_download(repo_id="fudan-generative-ai/hallo", local_dir="pretrained_models")
joyhallo_dir = snapshot_download(repo_id="jdh-algo/JoyHallo-v1", local_dir="pretrained_models/joyhallo")
wav_dir = snapshot_download(repo_id="TencentGameMate/chinese-wav2vec2-base", local_dir="pretrained_models/chinese-wav2vec2-base")
print(hallo_dir, joyhallo_dir)
print(os.listdir(hallo_dir))

# Imported after the downloads so the inference code finds the weights on disk.
from scripts.inference import predict


def run_inference(source_image, driving_audio, progress=gr.Progress(track_tqdm=True)):
    return predict(source_image, driving_audio, 1.0, 1.0, 1.0, 1.2)


css = '''
div#warning-ready {
    background-color: #ecfdf5;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-ready > .gr-prose > h2, div#warning-ready > .gr-prose > p {
    color: #057857!important;
}
div#warning-duplicate {
    background-color: #ebf5ff;
    padding: 0 16px 16px;
    margin: 20px 0;
    color: #030303!important;
}
div#warning-duplicate > .gr-prose > h2, div#warning-duplicate > .gr-prose > p {
    color: #0f4592!important;
}
div#warning-duplicate strong {
    color: #0f4592;
}
p.actions {
    display: flex;
    align-items: center;
    margin: 20px 0;
}
div#warning-duplicate .actions a {
    display: inline-block;
    margin-right: 10px;
}
.dark #warning-duplicate {
    background-color: #0c0c0c !important;
    border: 1px solid white !important;
}
'''

with gr.Blocks(css=css) as demo:
    gr.Markdown("# JoyHallo: Digital human model for Mandarin")
    gr.Markdown("Generate talking-head avatars driven by Mandarin speech.")
    gr.Markdown("""
    Data requirements:

    Image:
    1. Cropped to a square.
    2. The face should point forward and occupy 50%-70% of the image area.

    Audio:
    1. WAV format.
    2. Mandarin, English, or a mix of both, with clear speech and at most light background music.

    Important: long audio causes very long processing times; please keep the audio within 5 seconds.
    """)
    with gr.Row():
        with gr.Column():
            avatar_face = gr.Image(type="filepath", label="Face")
            driving_audio = gr.Audio(type="filepath", label="Driving audio")
            generate = gr.Button("Generate")
        with gr.Column():
            output_video = gr.Video(label="Your talking head")

    generate.click(
        fn=run_inference,
        inputs=[avatar_face, driving_audio],
        outputs=output_video,
    )

demo.launch()
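Note that run_inference pins the last four arguments of predict to 1.0, 1.0, 1.0, and 1.2. In the upstream Hallo inference script these positions correspond to pose weight, face weight, lip weight, and face-expand ratio; assuming JoyHallo keeps that signature (an assumption, not confirmed by this commit), a minimal sketch exposing them as sliders could look like this, with all names below hypothetical:

# Sketch only: assumes predict(source_image, driving_audio, pose_weight,
# face_weight, lip_weight, face_expand_ratio), as in Hallo's inference CLI.
def run_inference_tunable(source_image, driving_audio,
                          pose_weight=1.0, face_weight=1.0,
                          lip_weight=1.0, face_expand_ratio=1.2,
                          progress=gr.Progress(track_tqdm=True)):
    return predict(source_image, driving_audio,
                   pose_weight, face_weight, lip_weight, face_expand_ratio)

# Hypothetical wiring inside the gr.Blocks context, next to the existing inputs:
# pose_w = gr.Slider(0.0, 2.0, value=1.0, label="Pose weight")
# face_w = gr.Slider(0.0, 2.0, value=1.0, label="Face weight")
# lip_w = gr.Slider(0.0, 2.0, value=1.0, label="Lip weight")
# expand = gr.Slider(1.0, 2.0, value=1.2, label="Face expand ratio")
# generate.click(fn=run_inference_tunable,
#                inputs=[avatar_face, driving_audio, pose_w, face_w, lip_w, expand],
#                outputs=output_video)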
requirements.txt
ADDED
@@ -0,0 +1,30 @@
accelerate==0.28.0
audio-separator==0.17.2
av==12.1.0
bitsandbytes==0.43.1
decord==0.6.0
diffusers==0.27.2
einops==0.8.0
insightface==0.7.3
librosa==0.10.2.post1
mediapipe[vision]==0.10.14
mlflow==2.13.1
moviepy==1.0.3
numpy==1.26.4
omegaconf==2.3.0
onnx2torch==1.5.14
onnx==1.16.1
onnxruntime==1.18.0
opencv-contrib-python==4.9.0.80
opencv-python-headless==4.9.0.80
opencv-python==4.9.0.80
pillow==10.3.0
setuptools==70.0.0
torch==2.2.2
torchvision==0.17.2
tqdm==4.66.4
transformers==4.39.2
xformers==0.0.25.post1
isort==5.13.2
pylint==3.2.2
pre-commit==3.7.1
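app.py warns that long driving audio makes processing very slow and asks for clips of 5 seconds or less. Since librosa is pinned above, audio can be pre-trimmed before upload; the helper below is a hypothetical sketch, not part of the Space (soundfile, used for writing, is installed as a librosa dependency):

import librosa
import soundfile as sf

def trim_driving_audio(path, out_path="driving_5s.wav", max_seconds=5.0, sr=16000):
    # Load at most `max_seconds` of mono audio, resampling to 16 kHz,
    # the rate wav2vec2-style encoders typically expect.
    audio, _ = librosa.load(path, sr=sr, mono=True, duration=max_seconds)
    sf.write(out_path, audio, sr)
    return out_path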