import os

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class

from app_utils import preprocess_video_and_rank
from authors import AUTHORS
# Importing necessary components for the Gradio app
from description import DESCRIPTION_DYNAMIC  # , DESCRIPTION_STATIC
# import scipy.io.wavfile as wav
from paraformer import AudioReader, CttPunctuator, FSMNVad, ParaformerOffline

# Do not route local traffic through any configured HTTP proxy.
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"
############################### Speech section ###############################
classifier = foreign_class(
    source="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",  # ".\\emotion-recognition-wav2vec2-IEMOCAP"
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
)
ASR_model = ParaformerOffline()
vad = FSMNVad()
punc = CttPunctuator()
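
# Inference pipeline for a recording: FSMNVad splits the audio into voiced
# segments, ParaformerOffline transcribes each segment, CttPunctuator restores
# punctuation, and the SpeechBrain wav2vec2 classifier predicts an emotion label.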
def classify_continuous(audio):
    """Transcribe a recording with Paraformer and classify its emotion with SpeechBrain."""
    print(type(audio))
    print(audio)
    if audio is None:
        # Nothing was recorded or uploaded.
        return "none", "none", "none", None
    sample_rate, signal = audio  # speech input from the Gradio Audio component
    signal = signal.astype(np.float32)
    signal /= np.max(np.abs(signal))  # peak-normalize
    sf.write("a.wav", signal, sample_rate)
    signal, sample_rate = torchaudio.load("a.wav")
    signal1 = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(
        signal
    )
    torchaudio.save("out.wav", signal1, 16000, encoding="PCM_S", bits_per_sample=16)
    Audio = "out.wav"
    speech, sample_rate = AudioReader.read_wav_file(Audio)

    # Split into voiced segments, transcribe each one, then restore punctuation.
    segments = vad.segments_offline(speech)
    text_results = ""
    for part in segments:
        _result = ASR_model.infer_offline(
            speech[part[0] * 16 : part[1] * 16],
            hot_words="任意热词 空格分开",  # "any hot words, space-separated"
        )
        text_results += punc.punctuate(_result)[0]

    # Emotion classification on the 16 kHz waveform.
    out_prob, score, index, text_lab = classifier.classify_batch(signal1)
    return text_results, out_prob.squeeze(0).numpy(), text_lab[-1], Audio
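
# A minimal sketch of driving classify_continuous outside Gradio, assuming a
# mono WAV file "sample.wav" exists locally (the file name is only illustrative):
#
#   data, sr = sf.read("sample.wav", dtype="float32")
#   text, probs, label, wav_path = classify_continuous((sr, data))
#   print(text, label, probs)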
################################ Video section ################################
def clear_dynamic_info():
    """Reset the video tab components to their empty state."""
    return (
        gr.Video(value=None),
        gr.Plot(value=None),
        gr.Textbox(""),
    )
######################## Build the individual app blocks ########################
with gr.Blocks(css="app.css") as video:
    with gr.Tab("Dynamic App"):
        gr.Markdown(value=DESCRIPTION_DYNAMIC)
        with gr.Row():
            with gr.Column(scale=2):
                input_video = gr.Video(
                    sources=["webcam", "upload"], elem_classes="video1", format="mp4"
                )
                with gr.Row():
                    clear_btn_dynamic = gr.Button(
                        value="Clear", interactive=True, scale=1
                    )
                    # submit_dynamic = gr.Button(
                    #     value="Submit", interactive=True, scale=1, elem_classes="submit"
                    # )
                    submit_and_rank = gr.Button(
                        value="Score", interactive=True, scale=1, elem_classes="submit"
                    )
            with gr.Column(scale=2, elem_classes="dl4"):
                with gr.Row():
                    output_score = gr.Textbox(label="scores")
                    output_statistics = gr.Plot(
                        label="Statistics of emotions", elem_classes="stat"
                    )
                output_audio = gr.Audio(interactive=False)
                audio_test_button = gr.Button("Analyze speech")
                out1 = gr.Textbox(label="Speech analysis result")
                out2 = gr.Textbox(label="Audio emotion recognition 1")
                out3 = gr.Textbox(label="Audio emotion recognition 2")
                internal_audio = gr.Audio(interactive=False)
        gr.Examples(
            [
                "videos/video1.mp4",
                "videos/video2.mp4",
                "videos/sample.webm",
                "videos/cnm.mp4",
            ],
            [input_video],
        )
    with gr.Tab("Authors"):
        gr.Markdown(value=AUTHORS)
    clear_btn_dynamic.click(
        fn=clear_dynamic_info,
        inputs=[],
        outputs=[
            input_video,
            output_statistics,
            output_score,
        ],
        queue=True,
    )
    submit_and_rank.click(
        fn=preprocess_video_and_rank,
        inputs=input_video,
        outputs=[
            output_statistics,
            output_score,
            output_audio,
        ],
    )
    audio_test_button.click(
        fn=classify_continuous,
        inputs=output_audio,
        outputs=[out1, out2, out3, internal_audio],
    )
############################ Speech-only interface ############################
speech = gr.Interface(
    classify_continuous,
    gr.Audio(sources=["microphone"]),
    [
        gr.Text(label="Speech recognition result"),
        gr.Text(label="Audio emotion recognition 1"),
        gr.Text(label="Audio emotion recognition 2"),
        # classify_continuous returns a fourth value (the resampled wav path),
        # so a fourth output component is needed here.
        gr.Audio(label="Resampled audio"),
    ],
)
with gr.Blocks() as app:
    with gr.Tab("Speech"):
        speech.render()
    with gr.Tab("Video"):
        video.render()

app.launch()
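
# Deployment note (assumptions): the checkpoint under
# pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP, the
# example clips under videos/, app.css, and the local paraformer, app_utils,
# description, and authors modules must all be present next to this script
# before launch.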