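# Talk to Qwen2-Audio over WebRTC — a Hugging Face Space running on ZeroGPU.
# Microphone audio is streamed to the server, answered with
# Qwen2-Audio-7B-Instruct, and the conversation is shown in a Chatbot.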
import gradio as gr
from gradio_webrtc import WebRTC, AdditionalOutputs, ReplyOnPause
from pydub import AudioSegment
from io import BytesIO
import numpy as np
import librosa
import tempfile
from twilio.rest import Client
import os
import spaces
import uuid
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
import logging
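# Send gradio_webrtc DEBUG logs to the console so connection and streaming
# issues are visible in the Space logs.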
logging.basicConfig(level=logging.WARNING)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
logger = logging.getLogger("gradio_webrtc")
logger.setLevel(logging.DEBUG)
logger.addHandler(console_handler)
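# Load the Qwen2-Audio instruct model and its processor; device_map="auto"
# places the weights on the available GPU.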
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")
if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    rtc_configuration = None
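# transcribe() runs on a ZeroGPU worker (requested per call via @spaces.GPU).
# It receives the audio captured since the last pause plus two parallel
# conversation states: one in the transformers chat format and one for the
# Gradio Chatbot display.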
@spaces.GPU
def transcribe(audio: tuple[int, np.ndarray], transformers_convo: list[dict], gradio_convo: list[dict]):
    # Wrap the raw samples in an AudioSegment and export them as an mp3 file
    # that can be referenced from the chat template and played back in the UI.
    segment = AudioSegment(audio[1].tobytes(), frame_rate=audio[0], sample_width=audio[1].dtype.itemsize, channels=1)
    name = str(uuid.uuid4()) + ".mp3"
    segment.export(name, format="mp3")
    transformers_convo.append({"role": "user", "content": [{"type": "audio", "audio_url": name}]})
    # Show the caller's recorded audio as a user turn in the transcript.
    gradio_convo.append({"role": "user", "content": gr.Audio(value=name)})
    text = processor.apply_chat_template(transformers_convo, add_generation_prompt=True, tokenize=False)
    # Re-load every audio turn at the sampling rate the feature extractor expects.
    audios = []
    for message in transformers_convo:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    audios.append(
                        librosa.load(
                            BytesIO(open(ele["audio_url"], "rb").read()),
                            sr=processor.feature_extractor.sampling_rate,
                        )[0]
                    )
    inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
    # Move every input tensor (not just input_ids) onto the model's device so
    # generate() does not hit a CPU/GPU mismatch.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    generate_ids = model.generate(**inputs, max_length=256)
    generate_ids = generate_ids[:, inputs["input_ids"].size(1):]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print("response", response)
    transformers_convo.append({"role": "assistant", "content": response})
    gradio_convo.append({"role": "assistant", "content": response})
    yield AdditionalOutputs(transformers_convo, gradio_convo)
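# Build the UI: a WebRTC component streams microphone audio to the server and
# a Chatbot shows the running transcript.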
with gr.Blocks() as demo:
    gr.HTML(
        """
        <h1 style='text-align: center'>
        Talk to Qwen2Audio (Powered by WebRTC ⚡️)
        </h1>
        <p style='text-align: center'>
        Once you grant access to your microphone, you can talk naturally to Qwen2Audio.
        When you stop talking, the audio will be sent for processing.
        </p>
        <p style='text-align: center'>
        There will be some delay in responding due to acquiring the ZeroGPU resources.
        </p>
        <p style='text-align: center'>
        Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
        </p>
        """
    )
    transformers_convo = gr.State(value=[])
    with gr.Row():
        with gr.Column():
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
        with gr.Column():
            transcript = gr.Chatbot(label="transcript", type="messages")
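    # ReplyOnPause calls transcribe() (defined above) each time the caller stops
    # speaking; the values yielded via AdditionalOutputs are routed to the state
    # and Chatbot by on_additional_outputs below. time_limit mirrors the
    # 90-second limit described in the header text.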
    audio.stream(ReplyOnPause(transcribe), inputs=[audio, transformers_convo, transcript], outputs=[audio], time_limit=90)
    audio.on_additional_outputs(lambda state, chat: (state, chat), outputs=[transformers_convo, transcript])
if __name__ == "__main__":
    demo.launch()