kehanlu committed on
Commit
cebfc07
·
1 Parent(s): d63f117

first commit

Browse files
app.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Gradio app
# A chatbot that supports Audio inputs(user can upload an audio file.)

# from transformers import AutoModel, AutoTokenizer

import gradio as gr
from transformers import AutoModel


# Load the model once, outside Gradio's auto-reload cycle, so hot reloads
# during development do not re-download / re-initialize the weights.
if gr.NO_RELOAD:
    # trust_remote_code=True: the repo ships custom modeling code that is
    # executed locally — only safe because the model source is trusted.
    model = AutoModel.from_pretrained("DeSTA-ntu/DeSTA2-8B-beta", trust_remote_code=True)
    model.to("cuda")  # assumes a CUDA device is available — TODO confirm on CPU-only hosts
    model.eval()
def reset_chat(history, chatbot):
    """Start the conversation over.

    Returns a fresh history seeded with the system prompt, a cleared
    chatbot display (None), a disabled Send button, and a re-enabled
    Upload button.
    """
    fresh_history = [
        {
            "role": "system",
            "content": "Focus on the input audio. You are a helpful voice assistant.",
        }
    ]
    return (
        fresh_history,
        None,  # wipe the chatbot display
        gr.update(interactive=False),  # Send stays locked until audio is uploaded
        gr.update(interactive=True),   # Upload becomes usable again
    )
def upload_audio(history, speech, text_box, chatbot, chat_button, upload_button):
    """Attach an uploaded audio file to the conversation.

    On success: records the file in history under role "audio", echoes the
    path in the chatbot, enables the text box and Send button, and locks
    the Upload button. With no file selected, warns and returns every
    component unchanged.
    """
    print(speech)  # log the incoming file path for debugging

    # Guard clause: nothing was recorded/uploaded yet.
    if speech is None:
        gr.Warning("⚠️ Please upload an audio file first!", duration=5)
        return (history, speech, text_box, chatbot, chat_button, upload_button)

    history.append({"role": "audio", "content": speech})
    # Show the audio turn in the chat; the assistant slot stays None for now.
    chatbot.append([f"Speech: \n\n{speech}", None])

    return (
        history,
        gr.update(interactive=True),                                 # speech box
        gr.update(interactive=True, placeholder="Start chatting!"),  # text box
        chatbot,
        gr.update(interactive=True),   # Send button
        gr.update(interactive=False),  # Upload button locked after one upload
    )
def user_send_message(history, speech, text_box, chatbot):
    """Append the user's typed message to history and the chat display.

    The text box is cleared (value="") but kept interactive; the model
    reply is produced later by `model_response` via the .then() chain.
    """
    user_text = text_box
    history.append({"role": "user", "content": user_text})
    chatbot.append([f"{user_text}", None])  # assistant slot filled in later

    cleared_box = gr.update(interactive=True, placeholder="Start chatting!", value="")
    return (history, speech, cleared_box, chatbot)
def model_response(history, speech, text_box, chatbot):
    """Generate the assistant's reply for the current history.

    Runs greedy decoding (do_sample=False) on the module-level `model`,
    appends the reply to history, and fills in the pending chatbot turn.
    """
    print(history)  # log the full conversation for debugging

    # History already follows the chat format the model expects.
    generated_ids = model.chat(
        history, max_new_tokens=128, do_sample=False, temperature=1.0, top_p=1.0
    )
    decoded = model.tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    reply = decoded[0]

    history.append({"role": "assistant", "content": reply})
    chatbot[-1][1] = reply  # complete the [user, None] pair added earlier

    return (
        history,
        speech,
        gr.update(interactive=True, placeholder="Start chatting!"),  # text box
        chatbot,
    )
with gr.Blocks() as demo:
    gr.Markdown("# DeSTA2 demo page")
    message_box = gr.Markdown(value="have fun!", label="Message")

    # Conversation state shared across callbacks, seeded with the system prompt.
    history = gr.State([{ "role": "system", "content": "Focus on the input audio. You are a helpful voice assistant." }])
    # history = gr.State([])
    with gr.Row():
        chatbot = gr.Chatbot(label="DeSTA2", height="100%", min_height="400px")

    with gr.Row():
        with gr.Column():
            speech = gr.Audio(label="Audio", type="filepath", sources=["microphone", "upload"])
            upload_button = gr.Button("Upload")
        with gr.Column():
            # Text entry is locked until an audio clip has been uploaded.
            text_box = gr.Textbox(label="User", interactive=False, placeholder="Upload an audio first!")
            chat_button = gr.Button("Send", interactive=False)

    with gr.Row():
        # top_p = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, label="Top P")
        # temperature = gr.Slider(minimum=0.0, maximum=1.0, value=1.0, label="Temperature")
        gr.Button("Reset chat").click(
            reset_chat,
            inputs=[history, chatbot],
            outputs=[history, chatbot, chat_button, upload_button],
        )

    upload_button.click(
        upload_audio,
        inputs=[history, speech, text_box, chatbot, chat_button, upload_button],
        outputs=[history, speech, text_box, chatbot, chat_button, upload_button],
    )
    # Two-step send: echo the user's message first, then stream in the reply.
    chat_button.click(
        user_send_message,
        inputs=[history, speech, text_box, chatbot],
        outputs=[history, speech, text_box, chatbot],
    ).then(
        model_response,
        inputs=[history, speech, text_box, chatbot],
        outputs=[history, speech, text_box, chatbot],
    )

    with gr.Row():
        examples_prompt = gr.Examples(
            examples=[
                "Transcribe the speech accurately.",
                "What is the primary emotion conveyed by the speaker?",
                "Describe the content and tone of the audio in detail.",
                "Provide a summary of the audio content.",
                "Identify the language spoken in the recording.",
                "What does the background noise in the audio indicate?",
                "Identify if the speaker has a specific accent and describe it.",
                "What is the gender and approximate age of the speaker?",
                "Summarize the conversation happening in this audio.",
                "Classify the type of audio: speech, music, noise, or mixed.",
                "Assess the clarity and intelligibility of the speech.",
                "What is the emotional state of the speaker, and why do you think so?",
                # BUG FIX: a missing comma here previously caused implicit string
                # concatenation, merging this prompt with the next one.
                "Provide a timestamped breakdown of key events in the audio.",
                "將這段語音轉成文字,請確保準確的時間點。",
                "你能辨認出這段語音的情感是什麼嗎?",
                "這段聲音中的說話者有什麼情緒?",
                "從這段聲音中提取關鍵詞。",
                "請翻譯這段語音的內容。",
                "從這段聲音中找出說話者的性別和口音。",
            ],
            inputs=[text_box],
            label="Example prompts",
        )
    with gr.Row():
        examples = gr.Examples(
            examples=[
                ["assets/audios/0_000307.wav"],
                ["assets/audios/4_0_d47.wav"],
                ["assets/audios/7_1_d7.wav"],
                ["assets/audios/AccentClassification_AccentdbExtended_0193_british_s01_176.wav"],
                ["assets/audios/DialogueEmotionClassification_DailyTalk_0196_7_1_d756.wav"],
                ["assets/audios/EmotionRecognition_MultimodalEmotionlinesDataset_0026_dia382_utt0.wav"],
                ["assets/audios/LanguageIdentification_VoxForge_0000_de143-43.flac"],
                ["assets/audios/MUL0608_120.98_148.92.wav"],
                ["assets/audios/NoiseDetection_LJSpeech_MUSAN-Music_0199_music_LJSpeech-1.1_16k_LJ050-0033.wav"],
                ["assets/audios/Ses01F_script03_1_F029.wav"],
                ["assets/audios/Ses01M_script01_1_F014.wav"],
                ["assets/audios/Ses04F_impro02_M004.wav"],
                ["assets/audios/SpeakerVerification_LibriSpeech-TestClean_0046_3575-170457-0038.flac"],
                ["assets/audios/SpeechTextMatching_LJSpeech_0001_LJ001-0107.wav"],
                ["assets/audios/common_voice_en_34980360.mp3"],
                ["assets/audios/p284_159.wav"],
                ["assets/audios/p287_162.wav"],
            ],
            inputs=[speech],
            label="Example audios",
        )

if __name__ == "__main__":
    demo.launch(share=True)
assets/audios/0_000307.wav ADDED
Binary file (378 kB). View file
 
assets/audios/4_0_d47.wav ADDED
Binary file (370 kB). View file
 
assets/audios/7_1_d7.wav ADDED
Binary file (228 kB). View file
 
assets/audios/AccentClassification_AccentdbExtended_0193_british_s01_176.wav ADDED
Binary file (150 kB). View file
 
assets/audios/DialogueEmotionClassification_DailyTalk_0196_7_1_d756.wav ADDED
Binary file (391 kB). View file
 
assets/audios/EmotionRecognition_MultimodalEmotionlinesDataset_0026_dia382_utt0.wav ADDED
Binary file (289 kB). View file
 
assets/audios/LanguageIdentification_VoxForge_0000_de143-43.flac ADDED
Binary file (53.3 kB). View file
 
assets/audios/MUL0608_120.98_148.92.wav ADDED
Binary file (447 kB). View file
 
assets/audios/NoiseDetection_LJSpeech_MUSAN-Music_0199_music_LJSpeech-1.1_16k_LJ050-0033.wav ADDED
Binary file (129 kB). View file
 
assets/audios/Ses01F_script03_1_F029.wav ADDED
Binary file (391 kB). View file
 
assets/audios/Ses01M_script01_1_F014.wav ADDED
Binary file (201 kB). View file
 
assets/audios/Ses04F_impro02_M004.wav ADDED
Binary file (277 kB). View file
 
assets/audios/SpeakerVerification_LibriSpeech-TestClean_0046_3575-170457-0038.flac ADDED
Binary file (412 kB). View file
 
assets/audios/SpeechTextMatching_LJSpeech_0001_LJ001-0107.wav ADDED
Binary file (162 kB). View file
 
assets/audios/common_voice_en_34980360.mp3 ADDED
Binary file (37.4 kB). View file
 
assets/audios/p284_159.wav ADDED
Binary file (238 kB). View file
 
assets/audios/p287_162.wav ADDED
Binary file (397 kB). View file
 
assets/css/styles.css ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
/* Constrain audio players rendered inside table cells (the Gradio
   Examples tables) so wide waveform widgets do not stretch the layout. */
td audio {
    width: 100%;
    max-width: 300px;
}
td {
    min-width: 200px;
}

/* Tablet widths: cap all audio players. */
@media (max-width: 768px) {
    audio {
        max-width: 300px;
    }

}

/* Phone widths: tighter cap. */
@media (max-width: 480px) {
    audio {
        max-width: 200px;
    }
}
assets/images/dataset_construction.png ADDED
assets/images/figure1.png ADDED
assets/images/method.png ADDED
assets/images/model_training.png ADDED