import os
import argparse

import torch
import gradio as gr
import requests
import langid
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
from dotenv import load_dotenv
from openai import OpenAI
from elevenlabs.client import ElevenLabs
from elevenlabs import play, save

load_dotenv()

# Argument parsing
parser = argparse.ArgumentParser()
parser.add_argument("--share", action='store_true', default=False, help="make link public")
args = parser.parse_args()

# The same key is used by the ElevenLabs SDK client and by the raw REST calls below.
api_key = os.environ.get("ELEVENLABS_API_KEY")
client = ElevenLabs(api_key=api_key)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)
supported_languages = ['zh', 'en']

# Name given to the temporary cloned voice; also used to find it again for cleanup.
TRIAL_VOICE_NAME = "TrialVoice"

# Upper bound (seconds) for each REST call so a stalled connection cannot
# block a Gradio worker forever.
REQUEST_TIMEOUT_S = 30

# Hide Gradio footer. Must be defined before the UI is built so it can be
# handed to gr.Blocks; assigning it after launch() has no effect.
css = "footer {visibility: hidden}"


# Function to get all voices
def get_voices(api_key):
    """Return the decoded JSON body of ``GET /v1/voices``.

    Args:
        api_key: Value sent in the ``xi-api-key`` header.

    Returns:
        Whatever ``response.json()`` yields; the HTTP status is not checked.

    Raises:
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    url = "https://api.elevenlabs.io/v1/voices"
    headers = {"xi-api-key": api_key}
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_S)
    return response.json()


# Function to delete a voice by ID
def delete_voice(api_key, voice_id):
    """Issue ``DELETE /v1/voices/{voice_id}``.

    Args:
        api_key: Value sent in the ``xi-api-key`` header.
        voice_id: Identifier interpolated into the request URL.

    Returns:
        A ``(status_code, response_text)`` tuple; the status is not checked.

    Raises:
        requests.RequestException: On connection failure or timeout.
    """
    url = f"https://api.elevenlabs.io/v1/voices/{voice_id}"
    headers = {"xi-api-key": api_key}
    response = requests.delete(url, headers=headers, timeout=REQUEST_TIMEOUT_S)
    return response.status_code, response.text


def _delete_trial_voices():
    """Best-effort removal of every voice named ``TRIAL_VOICE_NAME``.

    Failures are printed rather than raised so that a cleanup problem never
    discards audio that was already synthesised.
    """
    try:
        data = get_voices(api_key)
        # An error payload may have no 'voices' key; treat that as "nothing to delete".
        trial_voice_ids = [
            entry.get("voice_id")
            for entry in data.get("voices", [])
            if entry.get("name") == TRIAL_VOICE_NAME
        ]
        for voice_id in trial_voice_ids:
            status_code, response_text = delete_voice(api_key, voice_id)
            print(f"Deleted voice ID {voice_id}: Status Code {status_code}, Response {response_text}")
    except (requests.RequestException, ValueError) as err:
        print(f"[WARN] Could not clean up '{TRIAL_VOICE_NAME}' voices: {err}")
        return

    if not trial_voice_ids:
        print("No voices with the name 'TrialVoice' found.")


def predict(prompt, style, audio_file_pth):
    """Clone the reference voice, speak ``prompt`` with it, and clean up.

    Args:
        prompt: Text to synthesise; must be 2 to 200 characters long.
        style: Selected style from the UI. Currently unused.
        audio_file_pth: Path of the reference audio file to clone from.

    Returns:
        ``(text_hint, save_path, audio_file_pth)`` on success, where
        ``save_path`` is the written audio file, or ``(text_hint, None, None)``
        with an ``[ERROR]`` message when input validation fails.

    Raises:
        Any exception raised by the ElevenLabs client or ``save``; the
        temporary voice is still cleaned up in that case.
    """
    text_hint = ''
    prompt = prompt or ''  # a cleared textbox must not crash len()

    if len(prompt) < 2:
        text_hint += "[ERROR] Please provide a longer prompt text.\n"
        return text_hint, None, None
    if len(prompt) > 200:
        text_hint += "[ERROR] Text length limited to 200 characters. Please try shorter text.\n"
        return text_hint, None, None
    if not audio_file_pth:
        # The audio input can be cleared in the UI, which yields None here.
        text_hint += "[ERROR] Please provide a reference audio file.\n"
        return text_hint, None, None

    print(audio_file_pth)
    save_path = f'{output_dir}/output.wav'

    try:
        voice = client.clone(
            name=TRIAL_VOICE_NAME,
            description="A trial voice model for testing",
            files=[audio_file_pth],
        )
        audio = client.generate(text=prompt, voice=voice)
        save(audio, save_path)
    finally:
        # Always remove the temporary voice, even when generation fails, so
        # cloned voices do not pile up on the account.
        _delete_trial_voices()

    return text_hint, save_path, audio_file_pth


# Gradio interface setup
with gr.Blocks(gr.themes.Glass(), css=css) as demo:
    with gr.Row():
        with gr.Column():
            input_text_gr = gr.Textbox(
                label="Text Prompt",
                info="One or two sentences at a time is better. Up to 200 text characters.",
                value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
            )
            # NOTE(review): this info text describes the reference audio, not the
            # style choice; it is shown under the Style dropdown as in the original.
            style_gr = gr.Dropdown(
                label="Style",
                choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
                info="Please upload a reference audio file that is atleast 1 minute long. For best results, ensure the audio is clear. You can use Adobe Podcast Enhance(https://podcast.adobe.com/enhance) to improve the audio quality before uploading.",
                max_choices=1,
                value="default",
            )
            ref_gr = gr.Audio(
                label="Reference Audio",
                type="filepath",
                value="resources/demo_speaker2.mp3",
            )
            tts_button = gr.Button("Send", elem_id="send-btn", visible=True)

        with gr.Column():
            out_text_gr = gr.Text(label="Info")
            audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
            ref_audio_gr = gr.Audio(label="Reference Audio Used")

    tts_button.click(
        predict,
        [input_text_gr, style_gr, ref_gr],
        outputs=[out_text_gr, audio_gr, ref_audio_gr],
    )

demo.queue()
demo.launch(debug=True, show_api=False, share=args.share)