import gradio as gr
import numpy as np
import io
import os
from openai import OpenAI
from pydub import AudioSegment
# The OpenAI client reads the OPENAI_API_KEY environment variable automatically,
# so the key only needs to be present in the environment (e.g. as a Space secret).
client = OpenAI()
def stream_and_yield_audio(text, model, voice):
    # Request speech synthesis from the OpenAI TTS endpoint
    response = client.audio.speech.create(
        model=model,  # e.g. "tts-1"
        voice=voice,  # e.g. "alloy"
        input=text,
    )

    # Wrap the binary MP3 response in an in-memory byte stream
    byte_stream = io.BytesIO(response.content)

    # Decode the MP3 audio from the byte stream
    audio = AudioSegment.from_file(byte_stream, format="mp3")

    # Extract the sample rate and raw 16-bit PCM samples for Gradio
    sample_rate = audio.frame_rate
    audio_data = np.array(audio.get_array_of_samples(), dtype=np.int16)

    # Yield a (sample_rate, samples) tuple for the streaming Audio component
    yield sample_rate, audio_data
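
# A hedged sketch, not part of the original demo: to stream the audio progressively
# instead of yielding the whole clip in one go, the generator above could yield
# fixed-size chunks of samples (the half-second chunk size is an assumption):
#
#   chunk_size = sample_rate // 2
#   for start in range(0, len(audio_data), chunk_size):
#       yield sample_rate, audio_data[start:start + chunk_size]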
# Demo UI (built with the older Gradio 3.50.2 API)
with gr.Blocks() as demo:
    with gr.Row():
        model = gr.Dropdown(choices=['tts-1', 'tts-1-hd'], label='Model', value='tts-1')
        voice = gr.Dropdown(choices=['alloy', 'echo', 'fable', 'onyx', 'nova', 'shimmer'], label='Voice Options', value='alloy')
    text = gr.Textbox(label="Input text")
    btn = gr.Button("Generate Speech")
    output_audio = gr.Audio(label="Speech Output", streaming=True, autoplay=True)
    btn.click(fn=stream_and_yield_audio, inputs=[text, model, voice], outputs=output_audio, api_name="tts-stream")
demo.queue().launch()
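
# A minimal sketch of calling the "tts-stream" endpoint from another process with
# gradio_client; the Space ID below is a placeholder, not the original deployment:
#
#   from gradio_client import Client
#
#   tts = Client("<owner>/<space-name>")  # hypothetical Space ID or URL
#   result = tts.predict(
#       "Hello from the TTS demo!",  # text
#       "tts-1",                     # model
#       "alloy",                     # voice
#       api_name="/tts-stream",
#   )
#   print(result)  # path to the audio file downloaded by the client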