import os

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class

# Local helper modules for the Gradio app.
from app_utils import preprocess_video_and_rank
from authors import AUTHORS
from description import DESCRIPTION_DYNAMIC
from paraformer import AudioReader, CttPunctuator, FSMNVad, ParaformerOffline

# Bypass any system proxy for local connections so Gradio can reach its own server.
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"

########################### Speech (audio) section ######################################
# Speech emotion recognition model (wav2vec2 fine-tuned on IEMOCAP, loaded via SpeechBrain).
classifier = foreign_class(
    source="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
)

# Paraformer ASR model, VAD for segmenting speech, and punctuation restoration.
ASR_model = ParaformerOffline()
vad = FSMNVad()
punc = CttPunctuator()


def classify_continuous(audio):
    """Transcribe microphone audio and classify its emotion.

    Returns the punctuated transcription, the emotion probability vector,
    and the predicted emotion label.
    """
    if audio is None:
        # Nothing was recorded; return placeholder outputs.
        return "none", "none", "none"

    sample_rate, signal = audio  # Gradio microphone input: (sample rate, numpy waveform)
    signal = signal.astype(np.float32)
    peak = np.max(np.abs(signal))
    if peak > 0:
        signal /= peak  # normalize to [-1, 1], avoiding division by zero on silence
    sf.write("a.wav", signal, sample_rate)

    # Reload with torchaudio and resample to the 16 kHz expected by both models.
    signal, sample_rate = torchaudio.load("a.wav")
    signal1 = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=16000)(
        signal
    )
    torchaudio.save("out.wav", signal1, 16000, encoding="PCM_S", bits_per_sample=16)

    speech, sample_rate = AudioReader.read_wav_file("out.wav")

    # Split the recording into speech segments (VAD returns [start_ms, end_ms] pairs),
    # transcribe each segment, and restore punctuation.
    segments = vad.segments_offline(speech)
    text_results = ""
    for part in segments:
        _result = ASR_model.infer_offline(
            speech[part[0] * 16 : part[1] * 16],  # 16 samples per millisecond at 16 kHz
            hot_words="任意热词 空格分开",  # optional space-separated hot words to bias the ASR
        )
        text_results += punc.punctuate(_result)[0]

    # Speech emotion recognition on the resampled waveform.
    out_prob, score, index, text_lab = classifier.classify_batch(signal1)

    return text_results, out_prob.squeeze(0).numpy(), text_lab[-1]


######################################### Video section ###################################
def clear_dynamic_info():
    return (
        gr.Video(value=None),
        gr.Plot(value=None),
        gr.Textbox(""),
    )


################################## Build the individual apps ####################
with gr.Blocks(css="app.css") as video:
    with gr.Tab("Dynamic App"):
        gr.Markdown(value=DESCRIPTION_DYNAMIC)
        with gr.Row():
            with gr.Column(scale=2):
                input_video = gr.Video(
                    sources=["webcam", "upload"], elem_classes="video1"
                )
                with gr.Row():
                    clear_btn_dynamic = gr.Button(
                        value="Clear", interactive=True, scale=1
                    )
                    submit_and_rank = gr.Button(
                        value="Score", interactive=True, scale=1, elem_classes="submit"
                    )
            with gr.Column(scale=2, elem_classes="dl4"):
                with gr.Row():
                    output_score = gr.Textbox(label="scores")
                output_statistics = gr.Plot(
                    label="Statistics of emotions", elem_classes="stat"
                )
                output_audio = gr.Audio(interactive=False)
        gr.Examples(
            [
                "videos/video1.mp4",
                "videos/video2.mp4",
                "videos/sample.webm",
                "videos/cnm.mp4",
            ],
            [input_video],
        )

    with gr.Tab("Authors"):
        gr.Markdown(value=AUTHORS)

    clear_btn_dynamic.click(
        fn=clear_dynamic_info,
        inputs=[],
        outputs=[
            input_video,
            output_statistics,
            output_score,
        ],
        queue=True,
    )
    submit_and_rank.click(
        fn=preprocess_video_and_rank,
        inputs=input_video,
        outputs=[
            output_statistics,
            output_score,
            output_audio,
        ],
    )

####################################
speech = gr.Interface(
    classify_continuous,
    gr.Audio(sources=["microphone"]),
    [
        gr.Text(label="Speech recognition result"),
        gr.Text(label="Audio emotion probabilities"),
        gr.Text(label="Audio emotion label"),
    ],
)

# Combine the speech and video apps into one tabbed interface.
with gr.Blocks() as app:
    with gr.Tab("Speech"):
        speech.render()
    with gr.Tab("Video"):
        video.render()

app.launch()