Spaces:
Running
on
Zero
Running
on
Zero
File size: 3,271 Bytes
7985f98 0b8cb49 304180d 0b8cb49 304180d 0b8cb49 5cb0b21 054e76a 304180d 5cb0b21 7985f98 054e76a 0b8cb49 054e76a 0b8cb49 5cb0b21 0b8cb49 7985f98 0b8cb49 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 |
import gradio as gr
from gradio_webrtc import WebRTC, AdditionalOutputs, ReplyOnPause
from pydub import AudioSegment
from io import BytesIO
import numpy as np
import librosa
import tempfile
from twilio.rest import Client
import os
import spaces
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
# Load the Qwen2-Audio processor and model once, at import time.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-Audio-7B-Instruct",
    device_map="auto",
)

# Twilio credentials are optional and read from the environment.
account_sid = os.getenv("TWILIO_ACCOUNT_SID")
auth_token = os.getenv("TWILIO_AUTH_TOKEN")

# Without both credentials the WebRTC component receives no explicit
# configuration; with them, it is pointed at Twilio's ICE servers in
# relay-only mode.
rtc_configuration = None
if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = dict(
        iceServers=token.ice_servers,
        iceTransportPolicy="relay",
    )
@spaces.GPU
def transcribe(audio: tuple[int, np.ndarray], transformers_convo: list[dict], gradio_convo: list[dict]):
    """Answer one spoken user turn with the audio model and update both histories.

    Args:
        audio: ``(sample_rate, samples)`` pair. The samples are serialized
            with ``tobytes()`` and encoded as a single-channel MP3.
        transformers_convo: Conversation passed to
            ``processor.apply_chat_template``. ``None`` starts a new one.
        gradio_convo: Message list shown in the chatbot. ``None`` starts a
            new one.

    Yields:
        One ``AdditionalOutputs(transformers_convo, gradio_convo)`` carrying
        the updated histories.
    """
    # A state created without an initial value arrives as None; appending to
    # it would raise AttributeError on the very first turn.
    if transformers_convo is None:
        transformers_convo = []
    if gradio_convo is None:
        gradio_convo = []

    sample_rate, samples = audio
    segment = AudioSegment(
        samples.tobytes(),
        frame_rate=sample_rate,
        sample_width=samples.dtype.itemsize,
        channels=1,
    )

    # The file path is stored in the conversation and re-read on every later
    # turn (see the loop below), so the file must outlive this call:
    # delete=False keeps it on disk. Trade-off: the clips accumulate in the
    # temp directory until something else removes them.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as temp_audio:
        audio_path = temp_audio.name
    segment.export(audio_path, format="mp3")

    transformers_convo.append(
        {"role": "user", "content": [{"type": "audio", "audio_url": audio_path}]}
    )
    # The recorded clip is the user's turn, so it is shown under the "user" role.
    gradio_convo.append({"role": "user", "content": gr.Audio(value=audio_path)})

    text = processor.apply_chat_template(
        transformers_convo, add_generation_prompt=True, tokenize=False
    )

    # Reload every audio clip referenced anywhere in the conversation, in
    # order, resampled to the rate the feature extractor expects.
    sampling_rate = processor.feature_extractor.sampling_rate
    audios = []
    for message in transformers_convo:
        content = message["content"]
        if not isinstance(content, list):
            continue
        for ele in content:
            if ele["type"] != "audio":
                continue
            with open(ele["audio_url"], "rb") as audio_file:
                audio_bytes = BytesIO(audio_file.read())
            audios.append(librosa.load(audio_bytes, sr=sampling_rate)[0])

    inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
    # Move the whole batch: assigning to `inputs.input_ids` only sets an
    # attribute and leaves the mapping unpacked by `**inputs` untouched.
    inputs = inputs.to("cuda")

    # NOTE(review): max_length counts prompt tokens too; a long conversation
    # may leave little room for the reply -- confirm whether max_new_tokens
    # is the intended limit.
    generate_ids = model.generate(**inputs, max_length=256)
    # Drop the prompt tokens so only the newly generated reply is decoded.
    generate_ids = generate_ids[:, inputs.input_ids.size(1):]

    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )[0]
    print("response", response)

    transformers_convo.append({"role": "assistant", "content": response})
    gradio_convo.append({"role": "assistant", "content": response})

    yield AdditionalOutputs(transformers_convo, gradio_convo)
# Build the UI: a send-only audio stream on the left, the chat transcript on
# the right, and a hidden state holding the model-side conversation.
with gr.Blocks() as demo:
    # Start from an empty list: transcribe() appends to this history, and a
    # State without an initial value would hand it None instead.
    transformers_convo = gr.State(value=[])
    with gr.Row():
        with gr.Column():
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
        with gr.Column():
            transcript = gr.Chatbot(label="transcript", type="messages")

    # ReplyOnPause wraps transcribe as the stream handler for the audio input.
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transformers_convo, transcript],
        outputs=[audio],
    )
    # transcribe() yields AdditionalOutputs with two values (both histories),
    # and two output components are declared here, so the callback takes both
    # values and returns one per output.
    audio.on_additional_outputs(
        lambda s, a: (s, a),
        outputs=[transformers_convo, transcript],
    )

if __name__ == "__main__":
    demo.launch()