# Talk to Smolvox Smollm2 (Powered by WebRTC ⚡️)

import gradio as gr
from gradio_webrtc import WebRTC, ReplyOnPause, AdditionalOutputs
import transformers
import numpy as np
from twilio.rest import Client
import os
import torch
import librosa
import spaces


# Both models are created on the same CUDA device.
_device = torch.device("cuda")

# Audio-in / text-out chat model. trust_remote_code=True allows code shipped
# in the model repository to be executed when the pipeline is built, so the
# repository must be trusted.
pipe = transformers.pipeline(
    model="reach-vb/smolvox-smollm2-whisper-turbo",
    device=_device,
    trust_remote_code=True,
)

# Separate Whisper pipeline; transcribe() uses it to obtain the text of what
# the user said so it can be stored in the chat histories.
whisper = transformers.pipeline(
    model="openai/whisper-large-v3-turbo",
    device=_device,
)

# Twilio credentials are optional and are read from the environment.
account_sid = os.environ.get("TWILIO_ACCOUNT_SID")
auth_token = os.environ.get("TWILIO_AUTH_TOKEN")

# Default: no explicit RTC configuration is handed to the WebRTC component.
rtc_configuration = None

if account_sid and auth_token:
    # With both credentials present, request a Twilio token and pass its ICE
    # servers to the WebRTC component, using the "relay" transport policy.
    client = Client(account_sid, auth_token)
    token = client.tokens.create()
    rtc_configuration = {
        "iceServers": token.ice_servers,
        "iceTransportPolicy": "relay",
    }

@spaces.GPU(duration=90)
def transcribe(
    audio: tuple[int, np.ndarray],
    transformers_chat: list[dict],
    conversation: list[dict],
):
    """Answer one spoken utterance and record the exchange in both histories.

    Args:
        audio: ``(sample_rate, samples)`` pair. The samples are divided by
            32768.0, so they are presumably int16 PCM -- TODO confirm.
        transformers_chat: Message list passed to the chat pipeline as
            ``"turns"``. Extended in place with the new user/assistant pair.
        conversation: Message list shown in the transcript. Extended in place
            with the same user/assistant pair.

    Yields:
        A single ``AdditionalOutputs`` carrying the updated
        ``transformers_chat`` and ``conversation``.
    """
    model_rate = 16000
    source_rate = audio[0]
    samples = audio[1]

    # Scale to float32 and resample to the rate handed to both models.
    waveform = librosa.resample(
        samples.astype(np.float32) / 32768.0,
        orig_sr=source_rate,
        target_sr=model_rate,
    )

    # The pipeline receives a shallow copy of the history, so anything it
    # appends to "turns" does not leak into the stored state.
    reply = pipe(
        {
            "audio": waveform,
            "turns": list(transformers_chat),
            "sampling_rate": model_rate,
        },
        max_new_tokens=512,
    )

    # Whisper supplies the text of what the user said.
    heard = whisper(
        {"array": waveform.squeeze(), "sampling_rate": model_rate}
    )["text"]

    # Record the exchange: first in the displayed transcript, then in the
    # model-side history. Each list gets its own dict objects.
    for history in (conversation, transformers_chat):
        history.append({"role": "user", "content": heard})
        history.append({"role": "assistant", "content": reply})

    yield AdditionalOutputs(transformers_chat, conversation)

@spaces.GPU(duration=90)
def respond_text(
    user_text: str,
    transformers_chat: list[dict],
    conversation: list[dict],
):
    """Answer a typed message and record the exchange in both histories.

    Args:
        user_text: Contents of the textbox. Blank or whitespace-only input is
            ignored.
        transformers_chat: Message list passed to the chat pipeline as
            ``"turns"``. Extended in place.
        conversation: Message list shown in the transcript. Extended in place.

    Returns:
        ``(transformers_chat, conversation)``, unchanged when ``user_text`` is
        blank, otherwise with the user message and the reply appended.
    """
    if user_text.strip():
        # The user turn is stored first, so the pipeline sees it as the last
        # entry of "turns".
        conversation.append({"role": "user", "content": user_text})
        transformers_chat.append({"role": "user", "content": user_text})

        # NOTE(review): this assumes the pipeline also accepts text input via
        # the "text" key -- confirm against the pipeline implementation.
        reply = pipe(
            {"text": user_text, "turns": transformers_chat},
            max_new_tokens=512,
        )

        conversation.append({"role": "assistant", "content": reply})
        transformers_chat.append({"role": "assistant", "content": reply})

    return transformers_chat, conversation


# Build the UI: an intro banner, a chat transcript with a text box and Send
# button on the left, and a WebRTC audio stream on the right. Both input paths
# (voice and typed text) update the same two histories.
with gr.Blocks() as demo:
    # NOTE(review): the intro text below says "Ultravox" while the heading and
    # the model loaded at the top of the file are Smolvox -- confirm wording.
    gr.HTML(
        """
        <h1 style='text-align: center'>
            Talk to Smolvox Smollm2 (Powered by WebRTC ⚡️)
        </h1>
        <p style='text-align: center'>
            Once you grant access to your microphone, you can talk naturally to Ultravox.
            When you stop talking, the audio will be sent for processing.
        </p>
        <p style='text-align: center'>
            Each conversation is limited to 90 seconds. Once the time limit is up you can rejoin the conversation.
        </p>
        """
    )
    # Shared conversation state: the message list that transcribe() and
    # respond_text() pass to the pipeline as "turns", seeded with the system
    # prompt.
    transformers_chat = gr.State(
        value=[
            {
                "role": "system",
                "content": "You are a friendly and helpful character. You love to answer questions for people.",
            }
        ]
    )

    with gr.Row():
        with gr.Column(scale=1):
            # Displayed chat history; also passed to the handlers as their
            # `conversation` argument.
            transcript = gr.Chatbot(label="Transcript", type="messages")
            text_input = gr.Textbox(
                placeholder="Type your message here...", label="Your Message"
            )
            send_button = gr.Button("Send")
        with gr.Column(scale=1):
            # Audio component configured with the module-level
            # rtc_configuration (None when no Twilio credentials are set).
            audio = WebRTC(
                rtc_configuration=rtc_configuration,
                label="Stream",
                mode="send",
                modality="audio",
            )

    # Audio stream: when you stop speaking, process the audio input.
    # time_limit=90 matches the 90-second limit stated in the intro text.
    audio.stream(
        ReplyOnPause(transcribe),
        inputs=[audio, transformers_chat, transcript],
        outputs=[audio],
        time_limit=90,
    )
    # transcribe() yields AdditionalOutputs(transformers_chat, conversation);
    # this pass-through lambda routes those two values into the state and the
    # transcript.
    audio.on_additional_outputs(
        lambda t, g: (t, g),
        outputs=[transformers_chat, transcript],
        queue=False,
        show_progress="hidden",
    )

    # Text input: when you click "Send", process the typed message.
    send_button.click(
        respond_text,
        inputs=[text_input, transformers_chat, transcript],
        outputs=[transformers_chat, transcript],
    )
    # Optionally clear the text box after sending:
    # NOTE(review): this is a second, independent click listener rather than
    # a step chained after respond_text -- confirm the typed text is always
    # captured before the box is cleared.
    send_button.click(lambda: "", inputs=[], outputs=[text_input])

# Launch the Gradio app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()