import gradio as gr
from gradio_webrtc import WebRTC, AdditionalOutputs, ReplyOnPause
from pydub import AudioSegment
import numpy as np
import librosa
from twilio.rest import Client
import os
import spaces
import uuid
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor
import logging

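# Verbose logging from gradio_webrtc is useful when debugging WebRTC connection issues.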
logging.basicConfig(level=logging.WARNING)
console_handler = logging.StreamHandler()
console_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter("%(name)s - %(levelname)s - %(message)s")
console_handler.setFormatter(formatter)
logger = logging.getLogger("gradio_webrtc")
logger.setLevel(logging.DEBUG)
logger.addHandler(console_handler)

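# Load Qwen2-Audio, an instruction-tuned audio/text chat model; device_map="auto"
# lets accelerate place the 7B weights on the available GPU(s).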
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")

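# Twilio's Network Traversal Service provides TURN servers so WebRTC audio can
# relay through restrictive NATs and firewalls. Without credentials, fall back
# to the component's default (STUN-only) configuration.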
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

if account_sid and auth_token:
    client = Client(account_sid, auth_token)

    token = client.tokens.create()

    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }
else:
    rtc_configuration = None


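# @spaces.GPU asks Hugging Face ZeroGPU to attach a GPU for the duration of each call.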
@spaces.GPU
def transcribe(audio: tuple[int, np.ndarray], transformers_convo: list[dict], gradio_convo: list[dict]):
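    """Run one conversational turn: encode the user's clip, generate a reply
    with Qwen2-Audio, and append both sides to the running conversations."""
    # gradio_webrtc delivers audio as a (sample_rate, ndarray) tuple; wrap the
    # raw samples in a mono AudioSegment so pydub can encode them to mp3.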
    segment = AudioSegment(
        audio[1].tobytes(),
        frame_rate=audio[0],
        sample_width=audio[1].dtype.itemsize,
        channels=1,
    )
    name = str(uuid.uuid4()) + ".mp3"
    segment.export(name, format="mp3")
    transformers_convo.append({"role": "user", "content": [{"type": "audio", "audio_url": name}]})
    gradio_convo.append({"role": "user", "content": gr.Audio(value=name)})
    text = processor.apply_chat_template(transformers_convo, add_generation_prompt=True, tokenize=False)
    audios = []
    for message in transformers_convo:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    # librosa can read the mp3 path directly, resampling to the
                    # rate the feature extractor expects.
                    audios.append(librosa.load(
                        ele["audio_url"],
                        sr=processor.feature_extractor.sampling_rate,
                    )[0])
    inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
    # Move every input tensor (ids, attention masks, audio features) to the
    # model's device; moving only input_ids can leave the batch split across devices.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

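    # Generate a reply, then strip the prompt tokens so only new text is decoded.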
    generate_ids = model.generate(**inputs, max_length=256)
    generate_ids = generate_ids[:, inputs["input_ids"].size(1):]
    response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print("response", response)
    transformers_convo.append({"role": "assistant", "content": response})
    gradio_convo.append({"role": "assistant", "content": response})

    yield AdditionalOutputs(transformers_convo, gradio_convo)


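# Build the UI: a send-only WebRTC audio stream on the left, a live transcript on the right.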
with gr.Blocks() as demo:
    gr.HTML(
    """
    <h1 style='text-align: center'>
    Talk to Qwen2Audio (Powered by WebRTC ⚡️)
    </h1>
    <p style='text-align: center'>
    Once you grant access to your microphone, you can talk naturally to Qwen2Audio.
    When you stop talking, the audio will be sent for processing.
    </p>
    <p style='text-align: center'>
    Responses may take a moment while ZeroGPU resources are acquired.
    </p>
    <p style='text-align: center'>
    Each conversation is limited to 90 seconds. Once the time limit is up, you can rejoin the conversation.
    </p>
    """
    )
    transformers_convo = gr.State(value=[])
    with gr.Row():
        with gr.Column():
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )
        with gr.Column():
            transcript = gr.Chatbot(label="Transcript", type="messages")

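    # ReplyOnPause runs `transcribe` each time the speaker pauses; values yielded
    # as AdditionalOutputs are routed to the components registered below.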
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transformers_convo, transcript],
        outputs=[audio],
        time_limit=90,
    )
    audio.on_additional_outputs(
        lambda s, a: (s, a),
        outputs=[transformers_convo, transcript],
    )

if __name__ == "__main__":
    demo.launch()