File size: 1,591 Bytes
e01375e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import gradio as gr
import numpy as np
import io
import os
from openai import OpenAI
from pydub import AudioSegment
from pydub.playback import play


# The OpenAI client reads its key from the OPENAI_API_KEY environment variable.
# Fail fast with a clear message when it is missing: the previous
# self-assignment into os.environ raised an opaque TypeError in that case
# (environment values must be strings, not None).
if not os.environ.get('OPENAI_API_KEY'):
    raise RuntimeError(
        "The OPENAI_API_KEY environment variable is not set; "
        "export it before starting the app."
    )

client = OpenAI()  # picks up OPENAI_API_KEY from the environment

def stream_and_yield_audio(text, model, voice):
    """Synthesize speech for *text* and yield it in Gradio's numpy audio format.

    The complete response is fetched and decoded as MP3 first, then yielded
    once as a ``(sample_rate, samples)`` tuple.

    Args:
        text: Text to speak. Must contain at least one non-whitespace character.
        model: TTS model name, e.g. ``"tts-1"``.
        voice: Voice name, e.g. ``"alloy"``.

    Yields:
        tuple: ``(sample_rate, audio_data)`` where ``audio_data`` is an int16
        numpy array of shape ``(samples,)`` for mono audio or
        ``(samples, channels)`` for multi-channel audio.

    Raises:
        ValueError: If *text* is empty or contains only whitespace.
    """
    # Reject empty input up front instead of spending an API round trip on it.
    if not text or not text.strip():
        raise ValueError("Input text must not be empty.")

    response = client.audio.speech.create(
        model=model,
        voice=voice,
        input=text,
    )

    # Decode the binary MP3 payload entirely in memory.
    audio = AudioSegment.from_file(io.BytesIO(response.content), format="mp3")

    # The samples are handed out as int16, so make sure the segment really is
    # 16-bit before reading them (no-op for the usual 2-byte sample width).
    if audio.sample_width != 2:
        audio = audio.set_sample_width(2)

    audio_data = np.array(audio.get_array_of_samples(), dtype=np.int16)

    # pydub interleaves channels in a flat array; reshape so each row is one
    # frame, otherwise the data would be interpreted as mono audio.
    if audio.channels > 1:
        audio_data = audio_data.reshape((-1, audio.channels))

    yield audio.frame_rate, audio_data
    

# Demo UI, written against an older Gradio release (3.50.2).
# Components are created in display order, so statement order defines the layout.
with gr.Blocks() as demo:
    # Top row: dropdowns for the TTS model and the voice.
    with gr.Row():
      model = gr.Dropdown(choices=['tts-1','tts-1-hd'], label='Model', value='tts-1')
      voice = gr.Dropdown(choices=['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'], label='Voice Options', value='alloy')

    text = gr.Textbox(label="Input text")
    btn = gr.Button("Greet")
    # Audio output configured for streaming and automatic playback.
    output_audio = gr.Audio(label="Speech Output", streaming=True, autoplay=True)
    
    # On click, call the TTS generator with (text, model, voice) and send what
    # it yields to the audio component; the handler is registered under the
    # API name "tts-stream".
    btn.click(fn=stream_and_yield_audio, inputs=[text,model, voice], outputs=output_audio, api_name="tts-stream")

# Enable the queue before launching; the click handler is a generator function.
demo.queue().launch()