# Provenance: app.py, commit 5e7226b ("Update app.py", verified), author Liusuthu, 4.69 kB.
import os
import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class
from app_utils import preprocess_video_and_rank
from authors import AUTHORS
# Importing necessary components for the Gradio app
from description import DESCRIPTION_DYNAMIC # , DESCRIPTION_STATIC
# import scipy.io.wavfile as wav
from paraformer import AudioReader, CttPunctuator, FSMNVad, ParaformerOffline
# Exclude loopback addresses from any configured proxy (presumably so the
# locally served Gradio app stays reachable -- confirm against deployment).
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"

########################### Speech section ######################################
# The same local directory is passed both as the model source and as the
# directory where the loaded model files are saved.
_EMOTION_MODEL_DIR = (
    "pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP"
)

# Speech-emotion classifier loaded through SpeechBrain's foreign_class helper.
classifier = foreign_class(
    source=_EMOTION_MODEL_DIR,
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir=_EMOTION_MODEL_DIR,
)

# Paraformer components used by classify_continuous: offline recogniser,
# voice-activity detector and punctuation restorer.
ASR_model = ParaformerOffline()
vad = FSMNVad()
punc = CttPunctuator()
def classify_continuous(audio):
    """Transcribe a recording and classify the speaker's emotion.

    Args:
        audio: The value of the Gradio audio input: a ``(sample_rate, samples)``
            tuple, or ``None`` when nothing was recorded.

    Returns:
        A 3-tuple ``(text, probabilities, label)``: the punctuated
        transcription, the classifier's output probabilities as a NumPy array,
        and the last predicted text label. When there is no audio (``None`` or
        zero samples) the placeholder tuple ``("none", "none", "haha")`` is
        returned instead.
    """
    # Nothing was recorded: return the placeholder result instead of crashing
    # on the tuple unpacking below.
    if audio is None:
        return "none", "none", "haha"

    sample_rate, signal = audio
    signal = np.asarray(signal).astype(np.float32)
    if signal.size == 0:
        return "none", "none", "haha"

    # Peak-normalise, but leave pure silence untouched to avoid dividing by
    # zero and filling the signal with NaNs.
    peak = np.max(np.abs(signal))
    if peak > 0:
        signal /= peak

    import tempfile

    # Work in a per-call temporary directory so concurrent requests do not
    # overwrite each other's intermediate WAV files.
    with tempfile.TemporaryDirectory() as workdir:
        raw_path = os.path.join(workdir, "a.wav")
        resampled_path = os.path.join(workdir, "out.wav")

        sf.write(raw_path, signal, sample_rate)
        waveform, file_rate = torchaudio.load(raw_path)
        # Both the recogniser input file and the classifier input are 16 kHz.
        resampled = torchaudio.transforms.Resample(
            orig_freq=file_rate, new_freq=16000
        )(waveform)
        torchaudio.save(
            resampled_path, resampled, 16000, encoding="PCM_S", bits_per_sample=16
        )
        samples, _ = AudioReader.read_wav_file(resampled_path)

    # Segment bounds are scaled by 16 to index the 16 kHz samples
    # (presumably the VAD reports milliseconds -- TODO confirm).
    pieces = []
    for part in vad.segments_offline(samples):
        recognised = ASR_model.infer_offline(
            samples[part[0] * 16 : part[1] * 16], hot_words="任意热词 空格分开"
        )
        pieces.append(punc.punctuate(recognised)[0])
    text_results = "".join(pieces)

    out_prob, score, index, text_lab = classifier.classify_batch(resampled)
    return text_results, out_prob.squeeze(0).numpy(), text_lab[-1]
######################################### Video section ###################################
def clear_dynamic_info():
    """Build the component values used to reset the dynamic video tab.

    Returns:
        A 3-tuple of an empty video, an empty plot and a textbox holding an
        empty string, in that order.
    """
    empty_video = gr.Video(value=None)
    empty_plot = gr.Plot(value=None)
    empty_text = gr.Textbox("")
    return empty_video, empty_plot, empty_text
################################## Build each sub-app ####################
# Example clips offered below the video input.
_EXAMPLE_VIDEOS = [
    "videos/video1.mp4",
    "videos/video2.mp4",
    "videos/sample.webm",
    "videos/cnm.mp4",
]

# NOTE(review): the nesting below is reconstructed from statement order; in
# particular, confirm whether the plot and audio widgets belong inside the
# inner row or directly in the right-hand column.
with gr.Blocks(css="app.css") as video:
    with gr.Tab("Dynamic App"):
        gr.Markdown(value=DESCRIPTION_DYNAMIC)
        with gr.Row():
            # Left column: the video input and its action buttons.
            with gr.Column(scale=2):
                input_video = gr.Video(
                    sources=["webcam", "upload"],
                    elem_classes="video1",
                )
                with gr.Row():
                    clear_btn_dynamic = gr.Button(
                        value="Clear",
                        interactive=True,
                        scale=1,
                    )
                    submit_and_rank = gr.Button(
                        value="Score",
                        interactive=True,
                        scale=1,
                        elem_classes="submit",
                    )
            # Right column: score text, emotion statistics plot, audio output.
            with gr.Column(scale=2, elem_classes="dl4"):
                with gr.Row():
                    output_score = gr.Textbox(label="scores")
                output_statistics = gr.Plot(
                    label="Statistics of emotions",
                    elem_classes="stat",
                )
                output_audio = gr.Audio(interactive=False)
        gr.Examples(examples=_EXAMPLE_VIDEOS, inputs=[input_video])

    with gr.Tab("Authors"):
        gr.Markdown(value=AUTHORS)

    # "Clear" resets the input video, the statistics plot and the score box.
    clear_btn_dynamic.click(
        fn=clear_dynamic_info,
        inputs=[],
        outputs=[input_video, output_statistics, output_score],
        queue=True,
    )
    # "Score" runs the video pipeline and fills the plot, score and audio.
    submit_and_rank.click(
        fn=preprocess_video_and_rank,
        inputs=input_video,
        outputs=[output_statistics, output_score, output_audio],
    )
####################################
# Speech sub-app: microphone input fed to classify_continuous, whose three
# return values populate the three text outputs in order.
_speech_outputs = [
    gr.Text(label="语音识别结果"),
    gr.Text(label="音频情感识别1"),
    gr.Text(label="音频情感识别2"),
]
speech = gr.Interface(
    fn=classify_continuous,
    inputs=gr.Audio(sources=["microphone"]),
    outputs=_speech_outputs,
)
# Top-level app: one tab per sub-app, speech first and video second.
with gr.Blocks() as app:
    for _tab_title, _sub_app in (("语音", speech), ("视频", video)):
        with gr.Tab(_tab_title):
            _sub_app.render()

app.launch()