# Scraped page header (not part of the program):
#   Spaces: Liusuthu / Portable-Depression-Detecting-System
#   Status: Runtime error
#   File size: 10,577 Bytes

import os

import gradio as gr
import numpy as np
import soundfile as sf
import torchaudio
from speechbrain.pretrained.interfaces import foreign_class

from app_utils import preprocess_video_and_rank,video_score
from authors import AUTHORS

# Importing necessary components for the Gradio app
from description import DESCRIPTION_DYNAMIC  # , DESCRIPTION_STATIC

# import scipy.io.wavfile as wav
from paraformer import AudioReader, CttPunctuator, FSMNVad, ParaformerOffline

from gradio_client import Client
# Remote client for the "Liusuthu/TextDepression" app; text_api() sends its
# requests through this object.
client = Client("Liusuthu/TextDepression")

# Keep loopback addresses out of any configured proxy
# (presumably so local Gradio traffic is not proxied -- confirm).
os.environ["no_proxy"] = "localhost,127.0.0.1,::1"


########################### Speech section ######################################
# Emotion classifier loaded through SpeechBrain's foreign_class helper. Both
# `source` and `savedir` point at the same local directory, and the wrapper
# class comes from custom_interface.py.
classifier = foreign_class(
    source="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",  # ".\\emotion-recognition-wav2vec2-IEMOCAP"
    pymodule_file="custom_interface.py",
    classname="CustomEncoderWav2vec2Classifier",
    savedir="pretrained_models/local-speechbrain/emotion-recognition-wav2vec2-IEMOCAP",
)
# Helpers from the project's paraformer module, used by the speech functions:
# ASR_model.infer_offline() transcribes, vad.segments_offline() splits the
# recording into segments, punc.punctuate() post-processes the transcript.
ASR_model = ParaformerOffline()
vad = FSMNVad()
punc = CttPunctuator()

def text_api(text: str):
    """Send ``text`` to the module-level ``client`` and return its answer.

    The call targets the client's ``/predict`` endpoint; whatever
    ``client.predict`` returns is handed back unchanged.
    """
    # `text` fills the endpoint's '输入文字' ("input text") Textbox component.
    prediction = client.predict(text, api_name="/predict")
    return prediction


def classify_continuous(audio):
    """Transcribe a recording and run the emotion classifier on it.

    The samples are peak-normalised, written to ``data/a.wav``, resampled to
    16 kHz and saved as 16-bit PCM in ``data/out.wav``. That file is split
    into segments by ``vad``; each segment is transcribed by ``ASR_model``
    and post-processed by ``punc``. The resampled waveform is also passed to
    ``classifier.classify_batch``.

    Args:
        audio: ``(sample_rate, samples)`` pair where ``samples`` is a numpy
            array (presumably the value a ``gr.Audio`` component delivers --
            confirm), or ``None`` when nothing was recorded.

    Returns:
        A 4-tuple ``(text, probabilities, label, wav_path)``: the joined
        transcript, the classifier probabilities as a numpy array, the last
        predicted label and the path of the 16 kHz WAV file. The event
        handlers in this file declare three outputs, so the fourth value is
        surplus there. When there is no usable input the placeholders
        ``("none", "none", "haha", None)`` are returned instead.
    """
    print(type(audio))
    print(audio)

    no_input = ("none", "none", "haha", None)
    if audio is None:
        # Nothing was recorded; unpacking None below would raise TypeError.
        return no_input

    sample_rate, signal = audio  # the speech input
    signal = signal.astype(np.float32)
    if signal.size == 0:
        # np.max() raises on an empty array and there is nothing to analyse.
        return no_input

    # Peak-normalise, but leave an all-zero (silent) recording untouched:
    # dividing by a zero peak would fill the signal with NaN.
    peak = np.max(np.abs(signal))
    if peak > 0:
        signal /= peak

    # The intermediate WAV files live in ./data; create it on first use.
    os.makedirs("data", exist_ok=True)
    sf.write("data/a.wav", signal, sample_rate)
    waveform, file_rate = torchaudio.load("data/a.wav")
    resampled = torchaudio.transforms.Resample(orig_freq=file_rate, new_freq=16000)(
        waveform
    )
    wav_path = "data/out.wav"
    torchaudio.save(wav_path, resampled, 16000, encoding="PCM_S", bits_per_sample=16)
    samples, _ = AudioReader.read_wav_file(wav_path)

    # Segment bounds are scaled by 16 before slicing -- presumably
    # milliseconds converted to sample indices at 16 kHz; TODO confirm.
    text_results = "".join(
        punc.punctuate(
            ASR_model.infer_offline(
                samples[part[0] * 16 : part[1] * 16], hot_words="任意热词 空格分开"
            )
        )[0]
        for part in vad.segments_offline(samples)
    )

    out_prob, _, _, text_lab = classifier.classify_batch(resampled)
    probabilities = out_prob.squeeze(0).numpy()
    label = text_lab[-1]
    print(type(probabilities))
    print(probabilities)
    print(type(label))
    print(label)
    return text_results, probabilities, label, wav_path


def speech_score(audio):
    """Score a recording from its audio emotion and its transcript.

    The audio pipeline (normalise, resample, transcribe, classify) is the one
    implemented by ``classify_continuous``; this function reuses it instead of
    repeating it, then derives a numeric score from the classifier
    probabilities and sends the transcript to ``text_api``.

    Args:
        audio: ``(sample_rate, samples)`` pair where ``samples`` is a numpy
            array (presumably the value a ``gr.Audio`` component delivers --
            confirm), or ``None`` when nothing was recorded.

    Returns:
        ``(score2, text_emo)``: ten times the difference between the first two
        classifier probabilities, and the result of ``text_api`` for the
        transcript. ``("none", "none")`` when there is no usable input.
    """
    if audio is None or np.size(audio[1]) == 0:
        # Nothing was recorded; there is no signal to score.
        return "none", "none"

    text_results, prob, label, _ = classify_continuous(audio)
    print(prob)
    # Which emotions indices 0 and 1 denote depends on the classifier's label
    # encoding, which is not visible in this file -- TODO confirm.
    score2 = 10 * prob[0] - 10 * prob[1]
    print("score2", score2)
    print(label)
    text_emo = text_api(text_results)
    print(text_emo)

    return score2, text_emo
        
######################################### Video section ###################################
def clear_dynamic_info():
    """Build the reset values for the "Dynamic App" Clear button.

    Returns a 3-tuple of fresh components: a video with no value, a plot with
    no value and a textbox holding the empty string. The Clear button in the
    ``video`` Blocks maps them onto ``input_video``, ``output_statistics`` and
    ``output_score``.
    """
    blank_video = gr.Video(value=None)
    blank_plot = gr.Plot(value=None)
    blank_text = gr.Textbox("")
    return blank_video, blank_plot, blank_text

def clear_video():
    """Build the reset values for the Clear button of the ``video_all`` Blocks.

    Returns a 4-tuple of fresh components: a video with no value, two numbers
    with no value and a textbox holding the empty string.
    """
    blank_video = gr.Video(value=None)
    blank_first_score = gr.Number(value=None)
    blank_second_score = gr.Number(value=None)
    blank_text = gr.Textbox("")
    return blank_video, blank_first_score, blank_second_score, blank_text
################################## Define each sub-app ####################
# "video" sub-app: one tab for scoring a video (with follow-up speech and text
# analysis of its audio track) and one tab listing the authors.
with gr.Blocks(css="app.css") as video:
    with gr.Tab("Dynamic App"):
        gr.Markdown(value=DESCRIPTION_DYNAMIC)
        with gr.Row():
            # Left column: video input plus the Clear / Score buttons.
            with gr.Column(scale=2):
                input_video = gr.Video(
                    sources=["webcam", "upload"], elem_classes="video1", format='mp4'
                )
                with gr.Row():
                    clear_btn_dynamic = gr.Button(
                        value="Clear", interactive=True, scale=1
                    )
                    # submit_dynamic = gr.Button(
                    #     value="Submit", interactive=True, scale=1, elem_classes="submit"
                    # )
                    submit_and_rank = gr.Button(
                        value="Score", interactive=True, scale=1, elem_classes="submit"
                    )
            # Right column: results of the video scoring and of the follow-up
            # audio / text analysis.
            with gr.Column(scale=2, elem_classes="dl4"):
                with gr.Row():
                    output_score = gr.Textbox(label="scores")
                output_statistics = gr.Plot(
                    label="Statistics of emotions", elem_classes="stat"
                )
                # Filled by the Score handler; fed to the speech analysis below.
                output_audio=gr.Audio(interactive=False)
                # Button caption: "Analyze speech".
                audio_test_button=gr.Button("分析语音")
                # Labels: "Speech analysis result", "Audio emotion recognition 1",
                # "Audio emotion recognition 2".
                out1=gr.Textbox(label="语音分析结果")
                out2=gr.Textbox(label="音频情感识别1")
                out3=gr.Textbox(label="音频情感识别2")
                # Button caption: "Analyze text".
                text_test_button=gr.Button("分析文本")
                text_result=gr.Textbox(interactive=False)
        # Sample videos that can be loaded into input_video.
        gr.Examples(
            [
                "videos/video1.mp4",
                "videos/video2.mp4",
                "videos/sample.webm",
                "videos/cnm.mp4",
            ],
            [input_video],
        )

    with gr.Tab("Authors"):
        gr.Markdown(value=AUTHORS)

    # Clear: reset the video, the plot and the score textbox.
    clear_btn_dynamic.click(
        fn=clear_dynamic_info,
        inputs=[],
        outputs=[
            input_video,
            output_statistics,
            output_score,
        ],
        queue=True,
    )
    # Score: preprocess_video_and_rank (from app_utils) receives the video and
    # supplies the plot, the score text and the audio component's value.
    submit_and_rank.click(
        fn=preprocess_video_and_rank,
        inputs=input_video,
        outputs=[
            output_statistics,
            output_score,
            output_audio,
        ],
    )
    # "Analyze speech": run the speech pipeline on the audio produced above.
    # NOTE(review): classify_continuous returns four values on its main path
    # while three outputs are bound here -- confirm the extra one is intended.
    audio_test_button.click(
        fn=classify_continuous,
        inputs=output_audio,
        outputs=[out1,out2,out3]
    )
    # "Analyze text": send the text in out1 to the remote text model.
    text_test_button.click(
        fn=text_api,
        inputs=out1,
        outputs=text_result,
    )

####################################
# "speech" sub-app: microphone recording in, three text fields out, all driven
# by classify_continuous. The labels read "Speech recognition result",
# "Audio emotion recognition 1" and "Audio emotion recognition 2".
speech = gr.Interface(
    fn=classify_continuous,
    inputs=gr.Audio(sources=["microphone"]),
    outputs=[
        gr.Text(label=caption)
        for caption in ("语音识别结果", "音频情感识别1", "音频情感识别2")
    ],
)
############################################################
# "video_all" sub-app: record a webcam video and show the three values that
# video_score (from app_utils) produces for it.
with gr.Blocks() as video_all:
    with gr.Row():
        # Left column: webcam input plus the Clear / Score buttons.
        with gr.Column(scale=2):
            # Rebinds the module-level name input_video; the handlers of the
            # `video` Blocks were registered with the earlier component object.
            input_video = gr.Video(
                sources=["webcam"], elem_classes="video1", format='mp4'
            )
            with gr.Row():
                clear_1 = gr.Button(
                    value="Clear", interactive=True, scale=1
                )
                submit_1 = gr.Button(
                    value="Score", interactive=True, scale=1, elem_classes="submit"
                )
        # Right column: read-only result fields.
        with gr.Column(scale=2):
            with gr.Row():
                score1=gr.Number(interactive=False,label="score1")
            with gr.Row():
                score2=gr.Number(interactive=False,label="score2")
            with gr.Row():
                result3=gr.Textbox(interactive=False)

    # Clear: reset the video and all three result fields.
    clear_1.click(
        fn=clear_video,
        inputs=[],
        outputs=[input_video,score1,score2,result3]
    )
    # Score: video_score receives the video and fills the three result fields.
    submit_1.click(
        fn=video_score,
        inputs=[input_video],
        outputs=[score1,score2,result3],
    )
###################################################################
def clear_2():
    """Build the reset values for the Clear button of the ``speech_all`` Blocks.

    Returns a 3-tuple of fresh components: an audio input with no value and
    two textboxes holding the empty string.
    """
    blank_audio = gr.Audio(value=None)
    blank_score = gr.Textbox("")
    blank_emotion = gr.Textbox("")
    return blank_audio, blank_score, blank_emotion


# "speech_all" sub-app: record speech and show the two values returned by
# speech_score.
with gr.Blocks() as speech_all:
    with gr.Row():
        # Left column: microphone input plus the Clear / Score buttons.
        with gr.Column(scale=2):
            # NOTE(review): `sources` is a bare string here, while the other
            # Audio/Video components in this file pass a list -- confirm the
            # installed Gradio version accepts both forms.
            input_audio=gr.Audio(sources="microphone")
            with gr.Row():
                clear_audio = gr.Button(
                    value="Clear", interactive=True, scale=1
                )
                submit_audio = gr.Button(
                    value="Score", interactive=True, scale=1, elem_classes="submit"
                )
        # Right column: read-only result fields. Both names rebind module-level
        # variables; handlers registered earlier keep their own components.
        with gr.Column(scale=2):
            score2=gr.Textbox(interactive=False,label="score2")
            text_emo=gr.Textbox(interactive=False,label="text_emo")

    # Clear: reset the recording and both result fields.
    clear_audio.click(
        fn=clear_2,
        outputs=[input_audio,score2,text_emo]
    )
    # Score: speech_score receives the recording and fills both result fields.
    submit_audio.click(
        fn=speech_score,
        inputs=[input_audio],
        outputs=[score2,text_emo],
    )
###################################################################

def clear_3():
    """Return two fresh empty-string textboxes (reset for the text tab)."""
    return tuple(gr.Textbox("") for _ in range(2))
def text_score(text):
    """Score ``text`` by forwarding it to ``text_api`` and returning its result."""
    return text_api(text)

# "text_all" sub-app: type a text and show the result of text_score for it.
with gr.Blocks() as text_all:
    with gr.Row():
        # Left column: text input plus the Clear / Score buttons.
        with gr.Column(scale=2):
            input_text=gr.Textbox(label="input")
            with gr.Row():
                clear_text = gr.Button(
                    value="Clear", interactive=True, scale=1
                )
                submit_text = gr.Button(
                    value="Score", interactive=True, scale=1, elem_classes="submit"
                )
        # Right column: result field (rebinds the module-level name text_emo).
        with gr.Column(scale=2):
            text_emo=gr.Textbox(label="text_emo")

    # Clear empties both textboxes; Score sends the input to text_score.
    clear_text.click(clear_3,outputs=[input_text,text_emo])
    submit_text.click(text_score,inputs=input_text,outputs=text_emo)
    
# Top-level application: every sub-app is rendered inside its own tab, in the
# order listed. Tab titles: "Speech", "Video", "Video integrated scoring",
# "Speech integrated scoring", "Text scoring".
with gr.Blocks() as app:
    for tab_title, tab_content in (
        ("语音", speech),
        ("视频", video),
        ("视频集成打分", video_all),
        ("语音集成打分", speech_all),
        ("文本打分", text_all),
    ):
        with gr.Tab(tab_title):
            tab_content.render()

# Start the Gradio server as soon as the script is executed.
app.launch()