Helw150 committed on
Commit
5279276
1 Parent(s): 8aaf9c8
Files changed (1) hide show
  1. app.py +41 -20
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import time
2
  import traceback
3
  from dataclasses import dataclass, field
@@ -5,6 +7,7 @@ from dataclasses import dataclass, field
5
  import gradio as gr
6
  import librosa
7
  import numpy as np
 
8
  import soundfile as sf
9
  import spaces
10
  import torch
@@ -12,7 +15,8 @@ import xxhash
12
  from datasets import Audio
13
  from transformers import AutoModel
14
  from transformers.modeling_outputs import CausalLMOutputWithPast
15
- import io
 
16
 
17
  if gr.NO_RELOAD:
18
  diva_model = AutoModel.from_pretrained(
@@ -48,10 +52,8 @@ def diva_audio(audio_input, do_sample=False, temperature=0.001, prev_outs=None):
48
 
49
  @dataclass
50
  class AppState:
51
- stream: np.ndarray | None = None
52
- sampling_rate: int = 0
53
- stopped: bool = False
54
  conversation: list = field(default_factory=list)
 
55
  model_outs: any = None
56
 
57
 
@@ -63,16 +65,16 @@ def process_audio(audio: tuple, state: AppState):
63
  def response(state: AppState, audio: tuple):
64
  if not audio:
65
  return AppState()
66
- state.stream = audio[1]
67
- state.sampling_rate = audio[0]
68
 
69
- file_name = f"/tmp/{xxhash.xxh32(bytes(state.stream)).hexdigest()}.wav"
70
 
71
- sf.write(file_name, state.stream, state.sampling_rate, format="wav")
72
 
73
  state.conversation.append(
74
  {"role": "user", "content": {"path": file_name, "mime_type": "audio/wav"}}
75
  )
 
 
76
  if spaces.config.Config.zero_gpu:
77
  if state.model_outs is not None:
78
  state.model_outs = tuple(
@@ -88,18 +90,23 @@ def response(state: AppState, audio: tuple):
88
  causal_outs = state.model_outs
89
  state.model_outs = None
90
  prev_outs = causal_outs
91
- start = False
92
  for resp, outs in diva_audio(
93
- (state.sampling_rate, state.stream),
94
  prev_outs=(prev_outs if prev_outs is not None else None),
95
  ):
96
- if not start:
97
- state.conversation.append({"role": "assistant", "content": resp})
98
- start = True
99
- else:
100
- state.conversation[-1]["content"] = resp
101
- print(resp)
102
- yield state, state.conversation
 
 
 
 
 
103
 
104
  del outs.logits
105
  del outs.hidden_states
@@ -107,9 +114,21 @@ def response(state: AppState, audio: tuple):
107
  outs = tuple(
108
  tuple(vec.cpu().numpy() for vec in tup) for tup in outs.past_key_values
109
  )
 
 
 
 
 
 
 
 
 
 
 
110
  yield (
111
  AppState(conversation=state.conversation, model_outs=outs),
112
  state.conversation,
 
113
  )
114
 
115
 
@@ -190,6 +209,8 @@ with gr.Blocks(theme=theme, js=js) as demo:
190
  )
191
  with gr.Row():
192
  chatbot = gr.Chatbot(label="Conversation", type="messages")
 
 
193
  state = gr.State(value=AppState())
194
  stream = input_audio.start_recording(
195
  process_audio,
@@ -197,15 +218,15 @@ with gr.Blocks(theme=theme, js=js) as demo:
197
  [input_audio, state],
198
  )
199
  respond = input_audio.stop_recording(
200
- response, [state, input_audio], [state, chatbot]
201
  )
202
- restart = respond.success(start_recording_user, [state], [input_audio]).then(
203
  lambda state: state, state, state, js=js_reset
204
  )
205
 
206
  cancel = gr.Button("Restart Conversation", variant="stop")
207
  cancel.click(
208
- lambda: (AppState(stopped=True), gr.Audio(recording=False)),
209
  None,
210
  [state, input_audio],
211
  cancels=[respond, restart],
 
1
+ import io
2
+ import os
3
  import time
4
  import traceback
5
  from dataclasses import dataclass, field
 
7
  import gradio as gr
8
  import librosa
9
  import numpy as np
10
+ import pvorca
11
  import soundfile as sf
12
  import spaces
13
  import torch
 
15
  from datasets import Audio
16
  from transformers import AutoModel
17
  from transformers.modeling_outputs import CausalLMOutputWithPast
18
+
19
+ orca = pvorca.create(access_key=os.environ.get("ORCA_KEY"))
20
 
21
  if gr.NO_RELOAD:
22
  diva_model = AutoModel.from_pretrained(
 
52
 
53
  @dataclass
54
  class AppState:
 
 
 
55
  conversation: list = field(default_factory=list)
56
+ stopped: bool = False
57
  model_outs: any = None
58
 
59
 
 
65
  def response(state: AppState, audio: tuple):
66
  if not audio:
67
  return AppState()
 
 
68
 
69
+ file_name = f"/tmp/{xxhash.xxh32(bytes(audio[1])).hexdigest()}.wav"
70
 
71
+ sf.write(file_name, audio[1], audio[0], format="wav")
72
 
73
  state.conversation.append(
74
  {"role": "user", "content": {"path": file_name, "mime_type": "audio/wav"}}
75
  )
76
+ state.conversation.append({"role": "assistant", "content": ""})
77
+ yield state, state.conversation, None
78
  if spaces.config.Config.zero_gpu:
79
  if state.model_outs is not None:
80
  state.model_outs = tuple(
 
90
  causal_outs = state.model_outs
91
  state.model_outs = None
92
  prev_outs = causal_outs
93
+ stream = orca.stream_open()
94
  for resp, outs in diva_audio(
95
+ (audio[0], audio[1]),
96
  prev_outs=(prev_outs if prev_outs is not None else None),
97
  ):
98
+ prev_resp = state.conversation[-1]["content"]
99
+ state.conversation[-1]["content"] = resp
100
+ pcm = stream.synthesize(resp[len(prev_resp) :])
101
+ audio_chunk = None
102
+ if pcm is not None:
103
+ mp3_io = io.BytesIO()
104
+ sf.write(
105
+ mp3_io, np.asarray(pcm).astype(np.int16), orca.sample_rate, format="mp3"
106
+ )
107
+ audio_chunk = mp3_io.getvalue()
108
+ mp3_io.close()
109
+ yield state, state.conversation, audio_chunk
110
 
111
  del outs.logits
112
  del outs.hidden_states
 
114
  outs = tuple(
115
  tuple(vec.cpu().numpy() for vec in tup) for tup in outs.past_key_values
116
  )
117
+ audio_chunk = None
118
+ pcm = stream.flush()
119
+ if pcm is not None:
120
+ audio_chunk = np.asarray(pcm).tobytes()
121
+ mp3_io = io.BytesIO()
122
+ sf.write(
123
+ mp3_io, np.asarray(pcm).astype(np.int16), orca.sample_rate, format="mp3"
124
+ )
125
+ audio_chunk = mp3_io.getvalue()
126
+ mp3_io.close()
127
+ stream.close()
128
  yield (
129
  AppState(conversation=state.conversation, model_outs=outs),
130
  state.conversation,
131
+ audio_chunk,
132
  )
133
 
134
 
 
209
  )
210
  with gr.Row():
211
  chatbot = gr.Chatbot(label="Conversation", type="messages")
212
+ with gr.Row():
213
+ output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
214
  state = gr.State(value=AppState())
215
  stream = input_audio.start_recording(
216
  process_audio,
 
218
  [input_audio, state],
219
  )
220
  respond = input_audio.stop_recording(
221
+ response, [state, input_audio], [state, chatbot, output_audio]
222
  )
223
+ restart = output_audio.stop(start_recording_user, [state], [input_audio]).then(
224
  lambda state: state, state, state, js=js_reset
225
  )
226
 
227
  cancel = gr.Button("Restart Conversation", variant="stop")
228
  cancel.click(
229
+ lambda: (AppState(), gr.Audio(recording=False)),
230
  None,
231
  [state, input_audio],
232
  cancels=[respond, restart],