jkeisling committed on
Commit
cff6a85
·
1 Parent(s): 16f2323

here goes nothing

Browse files
Files changed (2) hide show
  1. app.py +121 -4
  2. requirements.txt +3 -0
app.py CHANGED
@@ -1,7 +1,124 @@
1
  import gradio as gr
 
 
 
 
 
 
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
+ from fish_speech import LM
3
+ import re
4
+ from rustymimi import Tokenizer
5
+ from huggingface_hub import snapshot_download, hf_hub_download
6
+ import numpy as np
7
+ import spaces
8
 
9
# Voice mapping: UI display name -> speaker-token system prompt.
# The speaker index is the label's position in the tuple below:
#   0-7  are the US voices (Heart, index 0, is the default),
#   8-10 are the British voices.
_VOICE_LABELS = (
    "Heart (US)",
    "Bella (US)",
    "Nova (US)",
    "Sky (US)",
    "Sarah (US)",
    "Michael (US)",
    "Fenrir (US)",
    "Liam (US)",
    "Emma (UK)",
    "Isabella (UK)",
    "Fable (UK)",
)
voice_mapping = {
    label: f"<|speaker:{index}|>" for index, label in enumerate(_VOICE_LABELS)
}
36
 
37
+ # Initialize models
38
+ print("Downloading and initializing models...")
39
+
40
+
41
def get_mimi_path():
    """Fetch the Mimi tokenizer checkpoint from the Hugging Face Hub.

    Returns whatever ``hf_hub_download`` returns for the checkpoint file
    (the location of the downloaded/cached file).
    """
    return hf_hub_download(
        "kyutai/moshiko-mlx-bf16",
        "tokenizer-e351c8d8-checkpoint125.safetensors",
    )
46
+
47
+
48
# Fetch the SmolTTS model snapshot and the Mimi weights once, at import time.
# NOTE(review): `dir` shadows the builtin, but synthesize_speech reads this
# module-level name, so it is kept as-is here.
dir = snapshot_download(repo_id="jkeisling/smoltts_v0")
mimi_path = get_mimi_path()
# The LM is intentionally not built here; synthesize_speech constructs it
# on each call.
codec = Tokenizer(mimi_path)
52
+
53
+
54
def split_sentences(text):
    """Naively split ``text`` into sentences.

    A sentence boundary is any run of whitespace that directly follows
    ``.``, ``?`` or ``!``; the punctuation stays attached to the sentence
    that precedes it.

    Args:
        text: The input string. ``None`` and the empty string are accepted.

    Returns:
        A list of sentences with surrounding whitespace stripped and empty
        pieces removed. Empty, ``None`` or whitespace-only input yields ``[]``.
    """
    # Guard first: re.split raises TypeError on None.
    if not text:
        return []
    pieces = (piece.strip() for piece in re.split(r"(?<=[?.!])\s+", text))
    return [piece for piece in pieces if piece]
58
+
59
+
60
@spaces.GPU
def synthesize_speech(text, temperature, top_p, voice):
    """Generate speech from text, synthesizing each sentence separately.

    Args:
        text: Text to speak; it is split into sentences with ``split_sentences``.
        temperature: Sampling temperature forwarded to the LM as ``temp``.
        top_p: Nucleus-sampling threshold forwarded to the LM as ``top_p``.
        voice: A key of ``voice_mapping``; unknown values fall back to
            ``<|speaker:0|>``.

    Returns:
        A ``(24_000, samples)`` tuple, where ``samples`` is the flattened
        decoded audio of every sentence concatenated in order.

    Raises:
        gr.Error: If ``text`` contains nothing to synthesize.
    """
    # Validate before loading the model: with no sentences np.concatenate([])
    # would raise an opaque ValueError after the expensive setup.
    sentences = split_sentences(text) if text else []
    if not sentences:
        raise gr.Error("Please enter some text to synthesize.")

    # NOTE(review): the LM is rebuilt on every call (the module-level
    # construction is commented out in this file) — presumably because CUDA
    # is only available inside the @spaces.GPU context; confirm before hoisting.
    lm = LM(dir, dtype="bf16", device="cuda", version="dual_ar")
    sysprompt = voice_mapping.get(voice, "<|speaker:0|>")

    pcm_list = []
    for sentence in sentences:
        # Generate and decode each sentence individually.
        generated = lm([sentence], temp=temperature, top_p=top_p, sysprompt=sysprompt)
        pcm_list.append(codec.decode(generated).flatten())

    # Join the per-sentence PCM arrays into one clip.
    return (24_000, np.concatenate(pcm_list))
77
+
78
+
79
# Create the Gradio interface.
# NOTE(review): the nesting below is reconstructed from a flattened diff
# rendering — confirm the Row/Column layout against the intended design.
with gr.Blocks(
    theme=gr.themes.Default(
        font=[gr.themes.GoogleFont("IBM Plex Sans"), "Arial", "sans-serif"],
        font_mono=gr.themes.GoogleFont("IBM Plex Mono"),
        primary_hue=gr.themes.colors.blue,
        secondary_hue=gr.themes.colors.slate,
    )
) as demo:
    with gr.Row():
        gr.Markdown("""
        # SmolTTS v0

        SmolTTS v0 is an autoregressive 150M parameter character-level text-to-speech model pretrained with an [RQTransformer backbone](https://arxiv.org/abs/2203.01941) and paired with a pretrained [Mimi codec](https://arxiv.org/abs/2410.00037) vocoder. Designed for US and UK English, it was trained entirely on synthetic speech data generated using [Kokoro TTS](https://huggingface.co/hexgrad/Kokoro-82M). SmolTTS is Apache 2.0 licensed - enjoy!
        """)

    with gr.Row():
        with gr.Column():
            input_text = gr.Textbox(
                label="Input Text", placeholder="Enter text to synthesize...", lines=3
            )
            voice_dropdown = gr.Dropdown(
                label="Voice",
                choices=list(voice_mapping.keys()),
                # Must match a key of voice_mapping exactly (keys are
                # capitalized); the previous "heart (US)" was not a valid choice.
                value="Heart (US)",
                info="Select a voice (sysprompt mapping)",
            )
            with gr.Row():
                temperature = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.1, step=0.1, label="Temperature"
                )
                top_p = gr.Slider(
                    minimum=0.0, maximum=1.0, value=0.85, step=0.01, label="Top P"
                )
        with gr.Column():
            audio_output = gr.Audio(label="Generated Speech", type="numpy")

    generate_btn = gr.Button("Generate Speech", variant="primary")
    # Input order must match synthesize_speech(text, temperature, top_p, voice).
    generate_btn.click(
        fn=synthesize_speech,
        inputs=[input_text, temperature, top_p, voice_dropdown],
        outputs=[audio_output],
    )
122
+
123
# Start the server only when this file is executed as a script (not on import).
if __name__ == "__main__":
    demo.launch(share=False, server_name="0.0.0.0")
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ fish_speech_rs>=0.3.0
2
+ rustymimi
3
+ numpy