import gradio as gr
from gradio_webrtc import WebRTC, AdditionalOutputs, ReplyOnPause
from pydub import AudioSegment
from io import BytesIO
import numpy as np
import librosa
import tempfile
from twilio.rest import Client
import os
import spaces
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
import logging

# Configure the root logger to WARNING to suppress debug messages from other libraries
logging.basicConfig(level=logging.WARNING)

# Create a console handler
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)

# Create a formatter
formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)

# Configure the logger for the gradio_webrtc library
logger = logging.getLogger("gradio_webrtc")
logger.setLevel(logging.DEBUG)
logger.addHandler(console_handler)

# Load the Qwen2-Audio chat model and its processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-Audio-7B-Instruct", device_map="auto"
)

# Use Twilio TURN servers for WebRTC when credentials are set; otherwise fall back to the default configuration
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    rtc_configuration = None


@spaces.GPU
def transcribe(audio: tuple[int, np.ndarray], transformers_convo: list[dict], gradio_convo: list[dict]):
    # Wrap the raw microphone samples in a pydub segment so they can be exported as mp3
    segment = AudioSegment(
        audio[1].tobytes(),
        frame_rate=audio[0],
        sample_width=audio[1].dtype.itemsize,
        channels=1,
    )

    with tempfile.NamedTemporaryFile(suffix=".mp3") as temp_audio:
        segment.export(temp_audio.name, format="mp3")

        # Record the user's turn in both the model-facing and UI-facing conversations
        transformers_convo.append({"role": "user", "content": [{"type": "audio", "audio_url": temp_audio.name}]})
        gradio_convo.append({"role": "assistant", "content": gr.Audio(value=temp_audio.name)})

        # Build the chat prompt and collect every audio clip referenced in the conversation
        text = processor.apply_chat_template(transformers_convo, add_generation_prompt=True, tokenize=False)
        audios = []
        for message in transformers_convo:
            if isinstance(message["content"], list):
                for ele in message["content"]:
                    if ele["type"] == "audio":
                        audios.append(
                            librosa.load(
                                BytesIO(open(ele["audio_url"], "rb").read()),
                                sr=processor.feature_extractor.sampling_rate,
                            )[0]
                        )

        inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
        inputs = dict(**inputs)
        inputs["input_ids"] = inputs["input_ids"].to("cuda:0")
        generate_ids = model.generate(**inputs, max_length=256)
        # Strip the prompt tokens so only the newly generated response is decoded
        generate_ids = generate_ids[:, inputs["input_ids"].size(1):]

        response = processor.batch_decode(
            generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
        )[0]
        print("response", response)
        transformers_convo.append({"role": "assistant", "content": response})
        gradio_convo.append({"role": "assistant", "content": response})

        yield AdditionalOutputs(transformers_convo, gradio_convo)


with gr.Blocks() as demo:
    transformers_convo = gr.State(value=[])
    with gr.Row():
        with gr.Column():
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
        with gr.Column():
            transcript = gr.Chatbot(label="transcript", type="messages")

    # Run transcribe whenever the user pauses speaking; updated conversations come back via AdditionalOutputs
    audio.stream(ReplyOnPause(transcribe), inputs=[audio, transformers_convo, transcript], outputs=[audio])
    audio.on_additional_outputs(lambda s, a: (s, a), outputs=[transformers_convo, transcript])

if __name__ == "__main__":
    demo.launch()