Spaces: gpt-omni (Running)
gpt-omni committed • Commit 41c7b36 • Parent(s): aab4898
no streaming
app.py CHANGED
@@ -39,6 +39,7 @@ from litgpt.generate.base import sample
 
 device = "cuda" if torch.cuda.is_available() else "cpu"
 ckpt_dir = "./checkpoint"
+streaming_output = False
 
 
 OUT_CHUNK = 4096
@@ -236,6 +237,7 @@ def run_AT_batch_stream(
     nums_generate = stream_stride
     begin_generate = False
     current_index = 0
+    total_num = 0
     for _ in tqdm(range(2, max_returned_tokens - T + 1)):
         tokens_A, token_T = next_token_batch(
             model,
@@ -278,7 +280,7 @@ def run_AT_batch_stream(
         if index == 7:
             begin_generate = True
 
-        if begin_generate:
+        if begin_generate and streaming_output:
             current_index += 1
             if current_index == nums_generate:
                 current_index = 0
@@ -288,10 +290,17 @@ def run_AT_batch_stream(
 
         input_pos = input_pos.add_(1)
         index += 1
+        total_num += 1
+
     text = text_tokenizer.decode(torch.tensor(list_output[-1]))
     print(f"text output: {text}")
     model.clear_kv_cache()
-    return list_output
+    if not streaming_output:
+        snac = get_snac(list_output, 7, total_num-7)
+        audio_stream = generate_audio_data(snac, snacmodel, device)
+        return audio_stream
+
+    # return list_output
 
 
 for chunk in run_AT_batch_stream('./data/samples/output1.wav'):
@@ -313,13 +322,16 @@ def process_audio(audio):
         cnt += 1
         audio_data = np.frombuffer(chunk, dtype=np.int16)
         audio_data = audio_data.reshape(-1, OUT_CHANNELS)
-        yield OUT_RATE, audio_data.astype(np.int16)
+        if streaming_output:
+            yield OUT_RATE, audio_data.astype(np.int16)
+        else:
+            return OUT_RATE, audio_data.astype(np.int16)
 
 
 demo = gr.Interface(
     process_audio,
     inputs=gr.Audio(type="filepath", label="Microphone"),
-    outputs=[gr.Audio(label="Response", streaming=True, autoplay=True)],
+    outputs=[gr.Audio(label="Response", streaming=streaming_output, autoplay=True)],
     title="Chat Mini-Omni Demo",
     # live=True,
 )
|