File size: 3,893 Bytes
4e97450
 
562fd62
 
4e97450
 
 
 
 
562fd62
 
 
4e97450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a891c75
 
 
 
4e97450
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a891c75
 
 
 
 
 
 
 
 
 
 
4e97450
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import streamlit as st
import torch
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Hugging Face identifier shared by the model checkpoint and its tokenizer.
_MODEL_ID = "parler-tts/parler-tts-large-v1"


@st.cache_resource
def _load_model_and_tokenizer(model_id, target_device):
    """Load the Parler-TTS model and its tokenizer once per server process.

    Streamlit re-executes this script from top to bottom on every widget
    interaction. Without caching, the checkpoint would be loaded again on
    each rerun; ``st.cache_resource`` keeps a single shared instance keyed
    on the arguments.

    Args:
        model_id: Identifier passed to both ``from_pretrained`` calls.
        target_device: Device string the model is moved to.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    loaded_model = ParlerTTSForConditionalGeneration.from_pretrained(model_id).to(target_device)
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_id)
    return loaded_model, loaded_tokenizer


# Module-level names used by the rest of the script.
model, tokenizer = _load_model_and_tokenizer(_MODEL_ID, device)

# Stylesheet for the neon look: green (#0fff0f) text on dark backgrounds for
# the page body and text inputs, and inverted colours for buttons.
_NEON_THEME_CSS = """
    <style>
    body {
        background-color: #0f0f0f;
        color: #0fff0f;
    }
    .stTextInput, .stTextArea {
        background-color: #333333;
        color: #0fff0f;
    }
    .stButton > button {
        background-color: #0fff0f;
        color: #0f0f0f;
    }
    </style>
    """

# Inject the stylesheet into the page as raw HTML.
st.markdown(_NEON_THEME_CSS, unsafe_allow_html=True)

# Page heading.
st.title("🎤 Neon TTS Converter")

# Predefined voice options.
# Keys are the labels offered in the sidebar voice selector; values are the
# natural-language voice descriptions that are shown in the sidebar and later
# tokenized and passed to model.generate() as `input_ids`.
voices = {
    "Smooth Female": "A female speaker delivers a slightly expressive and animated speech with a moderate speed and pitch.",
    "Monotone Male": "Jon's voice is monotone yet slightly fast in delivery, with a very close recording that almost has no background noise.",
    "Energetic Youth": "An energetic young speaker with a lively tone and rapid speech, creating a sense of excitement.",
    "Calm Elderly": "An elderly speaker with a calm and slow-paced voice, bringing wisdom and serenity to the speech.",
    "Robotic": "A robotic, artificial voice with a consistent pitch and no variation in tone.",
    "Narrator": "A deep and clear voice, with a strong presence and a slightly slower pace, suitable for narrations.",
    "Whisper": "A soft, whispered voice, with very low volume and an intimate tone.",
    "Formal": "A formal, authoritative voice with clear articulation and a steady pace.",
    "Happy": "A cheerful, upbeat voice with a positive tone and lively intonation.",
    "Mysterious": "A mysterious and low-pitched voice, with slow delivery and a sense of intrigue.",
    "Bass-Heavy Male": "A deep, resonant male voice with a strong bass, ideal for dramatic and powerful delivery.",
    "Actor Voice 1": "An actor's voice with a dynamic range, capable of various emotional tones and expressions.",
    "Actor Voice 2": "A distinct and engaging actor's voice, providing a unique flair and character to the speech."
}

# --- Sidebar: voice picker ---
sidebar = st.sidebar
sidebar.header("Select Voice")
voice_names = list(voices)
voice_choice = sidebar.selectbox("Choose a Voice", voice_names)

# Echo the description text that belongs to the chosen voice.
selected_description = voices[voice_choice]
sidebar.markdown(f"**Description:** {selected_description}")

# --- Sidebar: text to be spoken ---
sidebar.header("Custom Prompt")
prompt = sidebar.text_area(
    "Enter your custom prompt",
    value="Hey, how are you doing today?",
)

# Synthesize speech when the sidebar button is pressed.
if st.sidebar.button("Generate Speech"):
    if not prompt.strip():
        # Nothing to synthesize: tell the user instead of running the model
        # on an empty or whitespace-only prompt.
        st.warning("Please enter some text to convert to speech.")
    else:
        # Top-level UI boundary: any failure during tokenization, generation
        # or file writing is reported in the page rather than crashing the run.
        try:
            description = voices[voice_choice]

            # Tokenize each text once and reuse the result for both the ids
            # and the attention mask.
            description_inputs = tokenizer(description, return_tensors="pt")
            prompt_inputs = tokenizer(prompt, return_tensors="pt")

            # The voice description conditions the model through `input_ids`;
            # the text to speak is passed through `prompt_input_ids`.
            generation = model.generate(
                input_ids=description_inputs.input_ids.to(device),
                attention_mask=description_inputs.attention_mask.to(device),
                prompt_input_ids=prompt_inputs.input_ids.to(device),
                prompt_attention_mask=prompt_inputs.attention_mask.to(device),
            )

            # Move the result to the CPU and drop singleton dimensions so it
            # can be written as a waveform.
            audio_arr = generation.cpu().numpy().squeeze()

            # Save the audio file.
            # NOTE(review): the filename is fixed and relative to the working
            # directory, so concurrent sessions would overwrite each other's
            # output - consider a per-session file or in-memory buffer.
            output_file = "parler_tts_out.wav"
            sf.write(output_file, audio_arr, model.config.sampling_rate)

            # Display the audio player.
            st.audio(output_file)
            st.success("Speech generation complete!")
        except Exception as e:
            st.error(f"An error occurred: {e}")