freddyaboulton (HF staff) committed
Commit ae20481
1 Parent(s): 8d9f39e

Update app.py

Files changed (1):
  1. app.py +27 -26
app.py CHANGED
@@ -8,6 +8,7 @@ import tempfile
 from twilio.rest import Client
 import os
 import spaces
+import uuid
 from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
 import logging
 
@@ -51,32 +52,32 @@ else:
 def transcribe(audio: tuple[int, np.ndarray], transformers_convo: list[dict], gradio_convo: list[dict]):
     segment = AudioSegment(audio[1].tobytes(), frame_rate=audio[0], sample_width=audio[1].dtype.itemsize, channels=1)
 
-    with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_audio:
-        segment.export(temp_audio.name, format="mp3")
-        transformers_convo.append({"role": "user", "content": [{"type": "audio", "audio_url": temp_audio.name}]})
-        gradio_convo.append({"role": "assistant", "content": gr.Audio(value=temp_audio.name)})
-        text = processor.apply_chat_template(transformers_convo, add_generation_prompt=True, tokenize=False)
-        audios = []
-        for message in transformers_convo:
-            if isinstance(message["content"], list):
-                for ele in message["content"]:
-                    if ele["type"] == "audio":
-                        audios.append(librosa.load(
-                            BytesIO(open(ele['audio_url'], "rb").read()),
-                            sr=processor.feature_extractor.sampling_rate)[0]
-                        )
-        inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
-        inputs = dict(**inputs)
-        inputs["input_ids"] = inputs["input_ids"].to("cuda:0")
-
-        generate_ids = model.generate(**inputs, max_length=256)
-        generate_ids = generate_ids[:, inputs["input_ids"].size(1):]
-        response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        print("response", response)
-        transformers_convo.append({"role": "assistant", "content": response})
-        gradio_convo.append({"role": "assistant", "content": response})
-
-        yield AdditionalOutputs(transformers_convo, gradio_convo)
+    name = str(uuid.uuid4()) + ".mp3"
+    segment.export(name, format="mp3")
+    transformers_convo.append({"role": "user", "content": [{"type": "audio", "audio_url": name}]})
+    gradio_convo.append({"role": "assistant", "content": gr.Audio(value=name)})
+    text = processor.apply_chat_template(transformers_convo, add_generation_prompt=True, tokenize=False)
+    audios = []
+    for message in transformers_convo:
+        if isinstance(message["content"], list):
+            for ele in message["content"]:
+                if ele["type"] == "audio":
+                    audios.append(librosa.load(
+                        BytesIO(open(ele['audio_url'], "rb").read()),
+                        sr=processor.feature_extractor.sampling_rate)[0]
+                    )
+    inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
+    inputs = dict(**inputs)
+    inputs["input_ids"] = inputs["input_ids"].to("cuda:0")
+
+    generate_ids = model.generate(**inputs, max_length=256)
+    generate_ids = generate_ids[:, inputs["input_ids"].size(1):]
+    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    print("response", response)
+    transformers_convo.append({"role": "assistant", "content": response})
+    gradio_convo.append({"role": "assistant", "content": response})
+
+    yield AdditionalOutputs(transformers_convo, gradio_convo)
 
 
 with gr.Blocks() as demo:
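
The commit swaps tempfile.NamedTemporaryFile for a uuid-named file, presumably because a named temporary file is deleted as soon as its context manager exits, while gr.Audio(value=...) and the later librosa.load still need the mp3 on disk. Below is a minimal standalone sketch of that export-and-reload flow using the same libraries (uuid, pydub, librosa); the sample_rate and samples values are illustrative stand-ins for the Gradio microphone tuple and are not part of the commit:

    import uuid

    import librosa
    import numpy as np
    from pydub import AudioSegment

    # Illustrative stand-in for the mic input tuple (sample_rate, samples).
    sample_rate = 16000
    samples = np.zeros(sample_rate, dtype=np.int16)  # one second of silence

    segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1,
    )

    # Persistent, collision-safe filename: unlike a NamedTemporaryFile,
    # this mp3 survives the function call, so the UI can still play it.
    name = str(uuid.uuid4()) + ".mp3"
    segment.export(name, format="mp3")  # mp3 export requires ffmpeg

    # Reload at the model's expected sampling rate, mirroring transcribe().
    waveform, _ = librosa.load(name, sr=16000)

The trade-off is that, unlike temporary files, the uuid-named mp3s are never deleted, so a long-running Space will accumulate them unless something cleans them up separately.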