gpt-omni committed on
Commit
41c7b36
1 Parent(s): aab4898

no streaming

Browse files
Files changed (1) hide show
  1. app.py +16 -4
app.py CHANGED
@@ -39,6 +39,7 @@ from litgpt.generate.base import sample
39
 
40
  device = "cuda" if torch.cuda.is_available() else "cpu"
41
  ckpt_dir = "./checkpoint"
 
42
 
43
 
44
  OUT_CHUNK = 4096
@@ -236,6 +237,7 @@ def run_AT_batch_stream(
236
  nums_generate = stream_stride
237
  begin_generate = False
238
  current_index = 0
 
239
  for _ in tqdm(range(2, max_returned_tokens - T + 1)):
240
  tokens_A, token_T = next_token_batch(
241
  model,
@@ -278,7 +280,7 @@ def run_AT_batch_stream(
278
  if index == 7:
279
  begin_generate = True
280
 
281
- if begin_generate:
282
  current_index += 1
283
  if current_index == nums_generate:
284
  current_index = 0
@@ -288,10 +290,17 @@ def run_AT_batch_stream(
288
 
289
  input_pos = input_pos.add_(1)
290
  index += 1
 
 
291
  text = text_tokenizer.decode(torch.tensor(list_output[-1]))
292
  print(f"text output: {text}")
293
  model.clear_kv_cache()
294
- return list_output
 
 
 
 
 
295
 
296
 
297
  for chunk in run_AT_batch_stream('./data/samples/output1.wav'):
@@ -313,13 +322,16 @@ def process_audio(audio):
313
  cnt += 1
314
  audio_data = np.frombuffer(chunk, dtype=np.int16)
315
  audio_data = audio_data.reshape(-1, OUT_CHANNELS)
316
- yield OUT_RATE, audio_data.astype(np.int16)
 
 
 
317
 
318
 
319
  demo = gr.Interface(
320
  process_audio,
321
  inputs=gr.Audio(type="filepath", label="Microphone"),
322
- outputs=[gr.Audio(label="Response", streaming=True, autoplay=True)],
323
  title="Chat Mini-Omni Demo",
324
  # live=True,
325
  )
 
39
 
40
  device = "cuda" if torch.cuda.is_available() else "cpu"
41
  ckpt_dir = "./checkpoint"
42
+ streaming_output = False
43
 
44
 
45
  OUT_CHUNK = 4096
 
237
  nums_generate = stream_stride
238
  begin_generate = False
239
  current_index = 0
240
+ total_num = 0
241
  for _ in tqdm(range(2, max_returned_tokens - T + 1)):
242
  tokens_A, token_T = next_token_batch(
243
  model,
 
280
  if index == 7:
281
  begin_generate = True
282
 
283
+ if begin_generate and streaming_output:
284
  current_index += 1
285
  if current_index == nums_generate:
286
  current_index = 0
 
290
 
291
  input_pos = input_pos.add_(1)
292
  index += 1
293
+ total_num += 1
294
+
295
  text = text_tokenizer.decode(torch.tensor(list_output[-1]))
296
  print(f"text output: {text}")
297
  model.clear_kv_cache()
298
+ if not streaming_output:
299
+ snac = get_snac(list_output, 7, total_num-7)
300
+ audio_stream = generate_audio_data(snac, snacmodel, device)
301
+ return audio_stream
302
+
303
+ # return list_output
304
 
305
 
306
  for chunk in run_AT_batch_stream('./data/samples/output1.wav'):
 
322
  cnt += 1
323
  audio_data = np.frombuffer(chunk, dtype=np.int16)
324
  audio_data = audio_data.reshape(-1, OUT_CHANNELS)
325
+ if streaming_output:
326
+ yield OUT_RATE, audio_data.astype(np.int16)
327
+ else:
328
+ return OUT_RATE, audio_data.astype(np.int16)
329
 
330
 
331
  demo = gr.Interface(
332
  process_audio,
333
  inputs=gr.Audio(type="filepath", label="Microphone"),
334
+ outputs=[gr.Audio(label="Response", streaming=streaming_output, autoplay=True)],
335
  title="Chat Mini-Omni Demo",
336
  # live=True,
337
  )