NeuralFalcon committed
Commit 6d89762 · verified · 1 Parent(s): 0af6533

Upload 8 files

Files changed (8)
  1. .gitignore +6 -0
  2. Kokoro_82M_Colab.ipynb +51 -0
  3. README.md +124 -11
  4. api.py +76 -0
  5. app.py +262 -0
  6. download_model.py +174 -0
  7. requirements.txt +14 -0
  8. srt_dubbing.py +557 -0
.gitignore ADDED
@@ -0,0 +1,6 @@
+ kokoro_audio/
+ KOKORO/voices/
+ cache/
+ __pycache__/
+ run_app.bat
+ *.pth
Kokoro_82M_Colab.ipynb ADDED
@@ -0,0 +1,51 @@
+ {
+   "nbformat": 4,
+   "nbformat_minor": 0,
+   "metadata": {
+     "colab": {
+       "provenance": [],
+       "gpuType": "T4"
+     },
+     "kernelspec": {
+       "name": "python3",
+       "display_name": "Python 3"
+     },
+     "language_info": {
+       "name": "python"
+     },
+     "accelerator": "GPU"
+   },
+   "cells": [
+     {
+       "cell_type": "code",
+       "source": [
+         "%cd /content/\n",
+         "!git clone https://github.com/NeuralFalconYT/Kokoro-82M-WebUI.git\n",
+         "!apt-get -qq -y install espeak-ng > /dev/null 2>&1\n",
+         "%cd /content/Kokoro-82M-WebUI\n",
+         "!python download_model.py\n",
+         "!pip install -r requirements.txt\n",
+         "from IPython.display import clear_output\n",
+         "clear_output()"
+       ],
+       "metadata": {
+         "id": "stDJD3G4KJwP"
+       },
+       "execution_count": null,
+       "outputs": []
+     },
+     {
+       "cell_type": "code",
+       "source": [
+         "%cd /content/Kokoro-82M-WebUI\n",
+         "!python app.py --share\n",
+         "# !python srt_dubbing.py --share"
+       ],
+       "metadata": {
+         "id": "XSQ2ShKtC1u9"
+       },
+       "execution_count": null,
+       "outputs": []
+     }
+   ]
+ }
README.md CHANGED
@@ -1,14 +1,127 @@
  ---
- title: Kokoro TTS
- emoji: 👀
- colorFrom: pink
- colorTo: purple
- sdk: gradio
- sdk_version: 5.12.0
- app_file: app.py
- pinned: false
- license: mit
- short_description: Kokoro TTS WebUI
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Kokoro-TTS
+
+ **Note:** This is not the official repository. Alternatives: [kokoro-onnx](https://github.com/thewh1teagle/kokoro-onnx), [Kokoro-FastAPI](https://github.com/remsky/Kokoro-FastAPI), [kokoro](https://github.com/hexgrad/kokoro), [kokoro-web](https://huggingface.co/spaces/webml-community/kokoro-web), [Kokoro-Custom-Voice](https://huggingface.co/spaces/ysharma/Make_Custom_Voices_With_KokoroTTS)
+
+ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NeuralFalconYT/Kokoro-82M-WebUI/blob/main/Kokoro_82M_Colab.ipynb) <br>
+ [![HuggingFace Space Demo](https://img.shields.io/badge/🤗-Space%20demo-yellow)](https://huggingface.co/spaces/hexgrad/Kokoro-TTS)
+
  ---
+
+ ### Installation Tutorial
+
+ My Python version is 3.10.9.
+
+ #### 1. Clone the GitHub Repository:
+ ```bash
+ git clone https://github.com/NeuralFalconYT/Kokoro-82M-WebUI.git
+ cd Kokoro-82M-WebUI
+ ```
+
+ #### 2. Create a Python Virtual Environment:
+ ```bash
+ python -m venv myenv
+ ```
+ This command creates a new Python virtual environment named `myenv` for isolating dependencies.
+
+ #### 3. Activate the Virtual Environment:
+ - **For Windows:**
+   ```bash
+   myenv\Scripts\activate
+   ```
+ - **For Linux:**
+   ```bash
+   source myenv/bin/activate
+   ```
+ This activates the virtual environment, enabling you to install and run dependencies in isolation.
+
+ #### 4. Install PyTorch:
+
+ - **For GPU (CUDA-enabled installation):**
+   - Check your CUDA version (for a GPU setup):
+     ```bash
+     nvcc --version
+     ```
+     Note your CUDA version, for example `11.8`.
+
+   - Visit [PyTorch Get Started](https://pytorch.org/get-started/locally/) and install the version compatible with your CUDA setup:<br>
+     - For CUDA 11.8:
+       ```
+       pip install torch --index-url https://download.pytorch.org/whl/cu118
+       ```
+     - For CUDA 12.1:
+       ```
+       pip install torch --index-url https://download.pytorch.org/whl/cu121
+       ```
+     - For CUDA 12.4:
+       ```
+       pip install torch --index-url https://download.pytorch.org/whl/cu124
+       ```
+ - **For CPU (if not using a GPU):**
+   ```bash
+   pip install torch
+   ```
+   This installs the CPU-only version of PyTorch.
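+
+   Optionally, you can sanity-check the install afterwards (a minimal sketch, not part of this repository):
+   ```python
+   import torch
+   print(torch.__version__)          # installed PyTorch version
+   print(torch.cuda.is_available())  # True if the CUDA build can see your GPU
+   ```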
+
+ #### 5. Install Required Dependencies:
+ ```bash
+ pip install -r requirements.txt
+ ```
+ This installs all the required Python libraries listed in the `requirements.txt` file.
+
+ #### 6. Download Model and Get Latest VoicePack:
+ ```bash
+ python download_model.py
+ ```
+ This downloads the model weights (including an fp16 variant) and the latest voice packs, and generates mixed voice packs.
+
+ ---
+
+ #### 7. Install eSpeak NG
+
+ - **For Windows:**
+   1. Download the latest eSpeak NG release from the [eSpeak NG GitHub Releases](https://github.com/espeak-ng/espeak-ng/releases/tag/1.51).
+   2. Locate and download the file named **`espeak-ng-X64.msi`**.
+   3. Run the installer and follow the installation steps. Ensure that you install eSpeak NG in the default directory:
+      ```
+      C:\Program Files\eSpeak NG
+      ```
+      > **Note:** This default path is required for the application to locate eSpeak NG properly.
+
+ - **For Linux:**
+   1. Open your terminal.
+   2. Install eSpeak NG using the following command:
+      ```bash
+      sudo apt-get -qq -y install espeak-ng > /dev/null 2>&1
+      ```
+      > **Note:** This command suppresses unnecessary output for a cleaner installation.
+
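+ After installing, you can verify eSpeak NG from a terminal (an optional check, not part of this repository):
+ ```bash
+ espeak-ng --version        # print the installed version
+ espeak-ng "Hello, world"   # speak a short test phrase
+ ```
+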
  ---

+ #### 8. Run Gradio App
+
+ To run the Gradio app, follow these steps:
+
+ 1. **Activate the Virtual Environment:**
+    ```bash
+    myenv\Scripts\activate
+    ```
+    (On Linux, use `source myenv/bin/activate` instead.)
+
+ 2. **Run the Application:**
+    ```bash
+    python app.py
+    ```
+
+ Alternatively, on Windows, double-click `run_app.bat` to start the application.
+
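+ The running app also exposes a programmatic API (see `api.py`, and `{your app URL}/?view=api` for the full reference). A minimal client sketch, assuming the app is running locally on the default port:
+
+ ```python
+ from gradio_client import Client
+
+ client = Client("http://127.0.0.1:7860/")
+ result = client.predict(
+     text="Hello!!",
+     model_name="kokoro-v0_19.pth",
+     voice_name="af_bella",
+     speed=1,
+     trim=0,
+     pad_between_segments=0,
+     remove_silence=False,
+     minimum_silence=0.05,
+     api_name="/text_to_speech",
+ )
+ print(result)  # path to the generated audio file
+ ```
+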
+ ---
+
+ ![app](https://github.com/user-attachments/assets/ef3e7c0f-8e72-471d-9639-5327b4f06b29)
+ ![Podcast](https://github.com/user-attachments/assets/03ddd9ee-5b41-4acb-b0c3-53ef5b1a7fbf)
+ ![voices](https://github.com/user-attachments/assets/d47f803c-b3fb-489b-bc7b-f08020401ce5)
+
+ ### Credits
+ [Kokoro HuggingFace](https://huggingface.co/hexgrad/Kokoro-82M)
api.py ADDED
@@ -0,0 +1,76 @@
+ # Helpful if you want to use the TTS in a voice assistant project.
+ # Learn more at {your gradio app url}/?view=api, e.g. http://127.0.0.1:7860/?view=api
+ import shutil
+ import os
+ from gradio_client import Client
+
+ # Ensure the output directory exists
+ output_dir = "temp_audio"
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Initialize the Gradio client
+ api_url = "http://127.0.0.1:7860/"
+ client = Client(api_url)
+
+ def text_to_speech(
+     text="Hello!!",
+     model_name="kokoro-v0_19.pth",
+     voice_name="af_bella",
+     speed=1,
+     trim=0,
+     pad_between_segments=0,
+     remove_silence=False,
+     minimum_silence=0.05,
+ ):
+     """
+     Generates speech from text using a specified model and saves the audio file.
+
+     Parameters:
+         text (str): The text to convert to speech.
+         model_name (str): The name of the model to use for synthesis.
+         voice_name (str): The name of the voice to use.
+         speed (float): The speed of speech.
+         trim (float): How much silence to trim from both ends of each segment.
+         pad_between_segments (float): Padding between audio segments.
+         remove_silence (bool): Whether to remove silence from the audio.
+         minimum_silence (float): Minimum silence duration to keep, in seconds.
+     Returns:
+         str: Path to the saved audio file.
+     """
+     # Call the API with the provided parameters
+     result = client.predict(
+         text=text,
+         model_name=model_name,
+         voice_name=voice_name,
+         speed=speed,
+         trim=trim,
+         pad_between_segments=pad_between_segments,
+         remove_silence=remove_silence,
+         minimum_silence=minimum_silence,
+         api_name="/text_to_speech"
+     )
+
+     # Save the audio file in the specified directory
+     save_at = f"{output_dir}/{os.path.basename(result)}"
+     shutil.move(result, save_at)
+     print(f"Saved at {save_at}")
+
+     return save_at
+
+ # Example usage
+ if __name__ == "__main__":
+     text = "This is Kokoro TTS. I am a text-to-speech model and Super Fast."
+     model_name = "kokoro-v0_19.pth"  # or kokoro-v0_19-half.pth
+     voice_name = "af_bella"  # see the Available Voice Names tab for options
+     speed = 1
+     only_trim_both_ends_silence = 0
+     add_silence_between_segments = 0  # used for large text
+     remove_silence = False
+     keep_silence_upto = 0.05  # in seconds
+     audio_path = text_to_speech(text=text, model_name=model_name,
+                                 voice_name=voice_name, speed=speed,
+                                 trim=only_trim_both_ends_silence,
+                                 pad_between_segments=add_silence_between_segments,
+                                 remove_silence=remove_silence,
+                                 minimum_silence=keep_silence_upto)
+     print(f"Audio file saved at: {audio_path}")
app.py ADDED
@@ -0,0 +1,262 @@
+ from KOKORO.models import build_model
+ from KOKORO.utils import tts, tts_file_name, podcast
+ import sys
+ sys.path.append('.')
+ import torch
+ import gc
+ print("Loading model...")
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ print(f'Using device: {device}')
+ MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
+ print("Model loaded successfully.")
+
+ def tts_maker(text, voice_name="af_bella", speed=0.8, trim=0, pad_between=0, save_path="temp.wav", remove_silence=False, minimum_silence=50):
+     # Sanitize the save_path to remove any newline characters
+     save_path = save_path.replace('\n', '').replace('\r', '')
+     global MODEL
+     audio_path = tts(MODEL, device, text, voice_name, speed=speed, trim=trim, pad_between_segments=pad_between, output_file=save_path, remove_silence=remove_silence, minimum_silence=minimum_silence)
+     return audio_path
+
+
+ model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
+ current_model = model_list[0]
+
+ def update_model(model_name):
+     """
+     Updates the TTS model only if the specified model is not already loaded.
+     """
+     global MODEL, current_model
+     if current_model == model_name:
+         return f"Model already set to {model_name}"  # No need to reload
+     model_path = f"./KOKORO/{model_name}"  # Default model path
+     if model_name == "kokoro-v0_19-half.pth":
+         model_path = f"./KOKORO/fp16/{model_name}"  # Update path for the fp16 model
+     # print(f"Loading new model: {model_name}")
+     del MODEL  # Clean up the existing model
+     gc.collect()
+     torch.cuda.empty_cache()  # Ensure GPU memory is cleared
+     MODEL = build_model(model_path, device)
+     current_model = model_name
+     return f"Model updated to {model_name}"
+
+
+ def text_to_speech(text, model_name, voice_name, speed, trim, pad_between_segments, remove_silence, minimum_silence):
+     """
+     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
+     """
+     update_status = update_model(model_name)  # Load the model only if required
+     # print(update_status)  # Log model loading status
+     if not minimum_silence:
+         minimum_silence = 0.05
+     keep_silence = int(minimum_silence * 1000)
+     save_at = tts_file_name(text)
+     audio_path = tts_maker(
+         text,
+         voice_name,
+         speed,
+         trim,
+         pad_between_segments,
+         save_at,
+         remove_silence,
+         keep_silence
+     )
+     return audio_path
+
+
+ import gradio as gr
+
+ # voice_list = [
+ #     'af',  # Default voice is a 50-50 mix of af_bella & af_sarah
+ #     'af_bella', 'af_sarah', 'am_adam', 'am_michael',
+ #     'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
+ # ]
+
+ import os
+
+ # Get the list of voice names without file extensions
+ voice_list = [
+     os.path.splitext(filename)[0]
+     for filename in os.listdir("./KOKORO/voices")
+     if filename.endswith('.pt')
+ ]
+
+ # Sort the list based on the length of each name
+ voice_list = sorted(voice_list, key=len)
+
+ def toggle_autoplay(autoplay):
+     return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
+
+ with gr.Blocks() as demo1:
+     gr.Markdown("# Batched TTS")
+     with gr.Row():
+         with gr.Column():
+             text = gr.Textbox(
+                 label='Enter Text',
+                 lines=3,
+                 placeholder="Type your text here..."
+             )
+             with gr.Row():
+                 voice = gr.Dropdown(
+                     voice_list,
+                     value='af',
+                     allow_custom_value=False,
+                     label='Voice',
+                     info='Starred voices are more stable'
+                 )
+             with gr.Row():
+                 generate_btn = gr.Button('Generate', variant='primary')
+             with gr.Accordion('Audio Settings', open=False):
+                 model_name = gr.Dropdown(model_list, label="Model", value=model_list[0])
+                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+                 minimum_silence = gr.Number(
+                     label="Keep Silence Up To (In seconds)",
+                     value=0.05
+                 )
+                 speed = gr.Slider(
+                     minimum=0.25, maximum=2, value=1, step=0.1,
+                     label='⚡️ Speed', info='Adjust the speaking speed'
+                 )
+                 trim = gr.Slider(
+                     minimum=0, maximum=1, value=0, step=0.1,
+                     label='🔪 Trim', info='How much to cut from both ends of each segment'
+                 )
+                 pad_between = gr.Slider(
+                     minimum=0, maximum=2, value=0, step=0.1,
+                     label='🔇 Pad Between', info='Silent duration between segments [for large text]'
+                 )
+
+         with gr.Column():
+             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+             with gr.Accordion('Enable Autoplay', open=False):
+                 autoplay = gr.Checkbox(value=True, label='Autoplay')
+                 autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+
+     text.submit(
+         text_to_speech,
+         inputs=[text, model_name, voice, speed, trim, pad_between, remove_silence, minimum_silence],
+         outputs=[audio]
+     )
+     generate_btn.click(
+         text_to_speech,
+         inputs=[text, model_name, voice, speed, trim, pad_between, remove_silence, minimum_silence],
+         outputs=[audio]
+     )
+
+ def podcast_maker(text, remove_silence=False, minimum_silence=50, model_name="kokoro-v0_19.pth"):
+     global MODEL, device
+     update_model(model_name)
+     if not minimum_silence:
+         minimum_silence = 0.05
+     keep_silence = int(minimum_silence * 1000)
+     podcast_save_at = podcast(MODEL, device, text, remove_silence=remove_silence, minimum_silence=keep_silence)
+     return podcast_save_at
+
+
+ dummy_example = """{af} Hello, I'd like to order a sandwich please.
+ {af_sky} What do you mean you're out of bread?
+ {af_bella} I really wanted a sandwich though...
+ {af_nicole} You know what, darn you and your little shop!
+ {bm_george} I'll just go back home and cry now.
+ {am_adam} Why me?"""
+ with gr.Blocks() as demo2:
+     gr.Markdown(
+         """
+         # Multiple Speech-Type Generation
+         This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, and the system will generate speech using the appropriate type. If unspecified, the model will use the "af" voice.
+         Format:
+         {voice_name} your text here
+         """
+     )
+     with gr.Row():
+         gr.Markdown(
+             """
+             **Example Input:**
+             {af} Hello, I'd like to order a sandwich please.
+             {af_sky} What do you mean you're out of bread?
+             {af_bella} I really wanted a sandwich though...
+             {af_nicole} You know what, darn you and your little shop!
+             {bm_george} I'll just go back home and cry now.
+             {am_adam} Why me?!
+             """
+         )
+     with gr.Row():
+         with gr.Column():
+             text = gr.Textbox(
+                 label='Enter Text',
+                 lines=7,
+                 placeholder=dummy_example
+             )
+             with gr.Row():
+                 generate_btn = gr.Button('Generate', variant='primary')
+             with gr.Accordion('Audio Settings', open=False):
+                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+                 minimum_silence = gr.Number(
+                     label="Keep Silence Up To (In seconds)",
+                     value=0.20
+                 )
+         with gr.Column():
+             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+             with gr.Accordion('Enable Autoplay', open=False):
+                 autoplay = gr.Checkbox(value=True, label='Autoplay')
+                 autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+
+     text.submit(
+         podcast_maker,
+         inputs=[text, remove_silence, minimum_silence],
+         outputs=[audio]
+     )
+     generate_btn.click(
+         podcast_maker,
+         inputs=[text, remove_silence, minimum_silence],
+         outputs=[audio]
+     )
+
+ display_text = " \n".join(voice_list)
+
+ with gr.Blocks() as demo3:
+     gr.Markdown(f"# Voice Names \n{display_text}")
+
+ import click
+ @click.command()
+ @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+ @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
+ def main(debug, share):
+     demo = gr.TabbedInterface([demo1, demo2, demo3], ["Batched TTS", "Multiple Speech-Type Generation", "Available Voice Names"], title="Kokoro TTS")
+
+     demo.queue().launch(debug=debug, share=share)
+     # Run on a local network instead:
+     # laptop_ip = "192.168.0.30"
+     # port = 8080
+     # demo.queue().launch(debug=debug, share=share, server_name=laptop_ip, server_port=port)
+
+ if __name__ == "__main__":
+     main()
+
+
+ ## For the client side
+ # from gradio_client import Client
+ # import shutil
+ # import os
+ # os.makedirs("temp_audio", exist_ok=True)
+ # client = Client("http://127.0.0.1:7860/")
+ # result = client.predict(
+ #     text="Hello!!",
+ #     model_name="kokoro-v0_19.pth",
+ #     voice_name="af_bella",
+ #     speed=1,
+ #     trim=0,
+ #     pad_between_segments=0,
+ #     remove_silence=False,
+ #     minimum_silence=0.05,
+ #     api_name="/text_to_speech"
+ # )
+ # save_at = f"./temp_audio/{os.path.basename(result)}"
+ # shutil.move(result, save_at)
+ # print(f"Saved at {save_at}")
download_model.py ADDED
@@ -0,0 +1,174 @@
+ from huggingface_hub import list_repo_files, hf_hub_download
+ import os
+ import shutil
+
+ # Repository ID
+ repo_id = "hexgrad/Kokoro-82M"
+
+ # Set up the cache directory
+ cache_dir = "./cache"  # Customize this path if needed
+ os.makedirs(cache_dir, exist_ok=True)
+
+ def get_voice_models():
+     # Recreate the 'voices' directory from scratch
+     voices_dir = './KOKORO/voices'
+     if os.path.exists(voices_dir):
+         shutil.rmtree(voices_dir)
+     os.makedirs(voices_dir, exist_ok=True)
+
+     # Get the list of all files in the repository
+     files = list_repo_files(repo_id)
+
+     # Filter files for the 'voices/' folder
+     voice_files = [file.replace("voices/", "") for file in files if file.startswith("voices/")]
+
+     # Get current files in the 'voices' folder
+     current_voice = os.listdir(voices_dir)
+
+     # Identify files that need to be downloaded
+     download_voice = [file for file in voice_files if file not in current_voice]
+     if download_voice:
+         print(f"Files to download: {download_voice}")
+
+     # Download each missing file
+     for file in download_voice:
+         file_path = hf_hub_download(repo_id=repo_id, filename=f"voices/{file}", cache_dir=cache_dir)
+         target_path = os.path.join(voices_dir, file)
+         shutil.copy(file_path, target_path)
+         print(f"Downloaded: {file} to {target_path}")
+
+ # Call the function to fetch the voice packs
+ get_voice_models()
+
+ # Check and download additional required files with caching
+ kokoro_file = "kokoro-v0_19.pth"
+ fp16_file = "fp16/kokoro-v0_19-half.pth"
+
+ if kokoro_file not in os.listdir("./KOKORO/"):
+     file_path = hf_hub_download(repo_id=repo_id, filename=kokoro_file, cache_dir=cache_dir)
+     shutil.copy(file_path, os.path.join("./KOKORO/", kokoro_file))
+     print(f"Downloaded: {kokoro_file} to ./KOKORO/")
+
+ if "fp16" not in os.listdir("./KOKORO/"):
+     os.makedirs("./KOKORO/fp16", exist_ok=True)
+
+ if os.path.basename(fp16_file) not in os.listdir("./KOKORO/fp16/"):
+     file_path = hf_hub_download(repo_id=repo_id, filename=fp16_file, cache_dir=cache_dir)
+     shutil.copy(file_path, os.path.join("./KOKORO/fp16/", os.path.basename(fp16_file)))
+     print(f"Downloaded: {os.path.basename(fp16_file)} to ./KOKORO/fp16/")
+
+
+ # For Windows one-click run
+ import os
+ import platform
+
+ def setup_batch_file():
+     # Check if the system is Windows
+     if platform.system() == "Windows":
+         # Check if 'run_app.bat' exists in the current folder
+         if os.path.exists("run_app.bat"):
+             print("'run_app.bat' already exists in the current folder.")
+         else:
+             # Content for run_app.bat
+             bat_content_app = '''@echo off
+ call myenv\\Scripts\\activate
+ @python.exe app.py %*
+ @pause
+ '''
+             # Save the content to run_app.bat
+             with open('run_app.bat', 'w') as bat_file:
+                 bat_file.write(bat_content_app)
+             print("The 'run_app.bat' file has been created.")
+     else:
+         print("This system is not Windows. Batch file creation skipped.")
+
+ # Run the setup function
+ setup_batch_file()
+
+
+ import torch
+ import os
+ from itertools import combinations
+
+ def mix_all_voices(folder_path="./KOKORO/voices"):
+     """Mix all pairs of voice models and save the new models."""
+     # Get the list of available voice packs
+     available_voice_pack = [
+         os.path.splitext(filename)[0]
+         for filename in os.listdir(folder_path)
+         if filename.endswith('.pt')
+     ]
+
+     # Generate all unique pairs of voices
+     voice_combinations = combinations(available_voice_pack, 2)
+
+     # def mix_model(voice_1, voice_2, weight_1=0.6, weight_2=0.4):
+     #     """Mix two voice models with a weighted average and save the new model."""
+     #     new_name = f"{voice_1}_mix_{voice_2}"
+     #     voice_id_1 = torch.load(f'{folder_path}/{voice_1}.pt', weights_only=True)
+     #     voice_id_2 = torch.load(f'{folder_path}/{voice_2}.pt', weights_only=True)
+     #
+     #     # Create the mixed model using a weighted average
+     #     mixed_voice = (weight_1 * voice_id_1) + (weight_2 * voice_id_2)
+     #
+     #     # Save the mixed model
+     #     torch.save(mixed_voice, f'{folder_path}/{new_name}.pt')
+     #     print(f"Created new voice model: {new_name}")
+
+     # Function to mix two voices
+     def mix_model(voice_1, voice_2):
+         """Mix two voice models and save the new model."""
+         new_name = f"{voice_1}_mix_{voice_2}"
+         voice_id_1 = torch.load(f'{folder_path}/{voice_1}.pt', weights_only=True)
+         voice_id_2 = torch.load(f'{folder_path}/{voice_2}.pt', weights_only=True)
+
+         # Create the mixed model by averaging the weights
+         mixed_voice = torch.mean(torch.stack([voice_id_1, voice_id_2]), dim=0)
+
+         # Save the mixed model
+         torch.save(mixed_voice, f'{folder_path}/{new_name}.pt')
+         print(f"Created new voice model: {new_name}")
+
+     # Create mixed voices for each pair
+     for voice_1, voice_2 in voice_combinations:
+         print(f"Mixing {voice_1} ❤️ {voice_2}")
+         mix_model(voice_1, voice_2)
+
+ # Call the function to mix all voices
+ mix_all_voices("./KOKORO/voices")
+
+
+ def save_voice_names(directory="./KOKORO/voices", output_file="./voice_names.txt"):
+     """
+     Retrieves voice names from a directory, sorts them by length, and saves them to a file.
+
+     Parameters:
+         directory (str): Directory containing the voice files.
+         output_file (str): File to save the sorted voice names.
+
+     Returns:
+         None
+     """
+     # Get the list of voice names without file extensions
+     voice_list = [
+         os.path.splitext(filename)[0]
+         for filename in os.listdir(directory)
+         if filename.endswith('.pt')
+     ]
+
+     # Sort the list based on the length of each name
+     voice_list = sorted(voice_list, key=len)
+
+     # Save the sorted list to the specified file
+     with open(output_file, "w") as f:
+         for voice_name in voice_list:
+             f.write(f"{voice_name}\n")
+
+     print(f"Voice names saved to {output_file}")
+
+ save_voice_names()
requirements.txt ADDED
@@ -0,0 +1,14 @@
+ phonemizer>=3.3.0
+ scipy>=1.14.1
+ munch>=4.0.0
+ transformers>=4.47.1
+ click>=8.1.8
+ librosa>=0.10.2
+ simpleaudio>=1.0.4
+ gradio>=5.9.1
+ huggingface-hub>=0.27.0
+ pydub>=0.25.1
+ pysrt>=1.1.2
+ # fastapi>=0.115.6
+ # uvicorn>=0.34.0
+ # torch
srt_dubbing.py ADDED
@@ -0,0 +1,557 @@
+ from KOKORO.models import build_model
+ from KOKORO.utils import tts, tts_file_name, podcast
+ import sys
+ sys.path.append('.')
+ import torch
+ import gc
+ print("Loading model...")
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
+ print(f'Using device: {device}')
+ MODEL = build_model('./KOKORO/kokoro-v0_19.pth', device)
+ print("Model loaded successfully.")
+
+ def tts_maker(text, voice_name="af_bella", speed=0.8, trim=0, pad_between=0, save_path="temp.wav", remove_silence=False, minimum_silence=50):
+     # Sanitize the save_path to remove any newline characters
+     save_path = save_path.replace('\n', '').replace('\r', '')
+     global MODEL
+     audio_path = tts(MODEL, device, text, voice_name, speed=speed, trim=trim, pad_between_segments=pad_between, output_file=save_path, remove_silence=remove_silence, minimum_silence=minimum_silence)
+     return audio_path
+
+
+ model_list = ["kokoro-v0_19.pth", "kokoro-v0_19-half.pth"]
+ current_model = model_list[0]
+
+ def update_model(model_name):
+     """
+     Updates the TTS model only if the specified model is not already loaded.
+     """
+     global MODEL, current_model
+     if current_model == model_name:
+         return f"Model already set to {model_name}"  # No need to reload
+     model_path = f"./KOKORO/{model_name}"  # Default model path
+     if model_name == "kokoro-v0_19-half.pth":
+         model_path = f"./KOKORO/fp16/{model_name}"  # Update path for the fp16 model
+     # print(f"Loading new model: {model_name}")
+     del MODEL  # Clean up the existing model
+     gc.collect()
+     torch.cuda.empty_cache()  # Ensure GPU memory is cleared
+     MODEL = build_model(model_path, device)
+     current_model = model_name
+     return f"Model updated to {model_name}"
+
+
+ def text_to_speech(text, model_name="kokoro-v0_19.pth", voice_name="af", speed=1.0, trim=1.0, pad_between_segments=0, remove_silence=True, minimum_silence=0.20):
+     """
+     Converts text to speech using the specified parameters and ensures the model is updated only if necessary.
+     """
+     update_status = update_model(model_name)  # Load the model only if required
+     # print(update_status)  # Log model loading status
+     if not minimum_silence:
+         minimum_silence = 0.05
+     keep_silence = int(minimum_silence * 1000)
+     save_at = tts_file_name(text)
+     audio_path = tts_maker(
+         text,
+         voice_name,
+         speed,
+         trim,
+         pad_between_segments,
+         save_at,
+         remove_silence,
+         keep_silence
+     )
+     return audio_path
+
+
+ import gradio as gr
+
+ # voice_list = [
+ #     'af',  # Default voice is a 50-50 mix of af_bella & af_sarah
+ #     'af_bella', 'af_sarah', 'am_adam', 'am_michael',
+ #     'bf_emma', 'bf_isabella', 'bm_george', 'bm_lewis',
+ # ]
+
+ import os
+
+ # Get the list of voice names without file extensions
+ voice_list = [
+     os.path.splitext(filename)[0]
+     for filename in os.listdir("./KOKORO/voices")
+     if filename.endswith('.pt')
+ ]
+
+ # Sort the list based on the length of each name
+ voice_list = sorted(voice_list, key=len)
+
+ def toggle_autoplay(autoplay):
+     return gr.Audio(interactive=False, label='Output Audio', autoplay=autoplay)
+
+ with gr.Blocks() as demo1:
+     gr.Markdown("# Batched TTS")
+     with gr.Row():
+         with gr.Column():
+             text = gr.Textbox(
+                 label='Enter Text',
+                 lines=3,
+                 placeholder="Type your text here..."
+             )
+             with gr.Row():
+                 voice = gr.Dropdown(
+                     voice_list,
+                     value='af',
+                     allow_custom_value=False,
+                     label='Voice',
+                     info='Starred voices are more stable'
+                 )
+             with gr.Row():
+                 generate_btn = gr.Button('Generate', variant='primary')
+             with gr.Accordion('Audio Settings', open=False):
+                 model_name = gr.Dropdown(model_list, label="Model", value=model_list[0])
+                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+                 minimum_silence = gr.Number(
+                     label="Keep Silence Up To (In seconds)",
+                     value=0.05
+                 )
+                 speed = gr.Slider(
+                     minimum=0.25, maximum=2, value=1, step=0.1,
+                     label='⚡️ Speed', info='Adjust the speaking speed'
+                 )
+                 trim = gr.Slider(
+                     minimum=0, maximum=1, value=0, step=0.1,
+                     label='🔪 Trim', info='How much to cut from both ends of each segment'
+                 )
+                 pad_between = gr.Slider(
+                     minimum=0, maximum=2, value=0, step=0.1,
+                     label='🔇 Pad Between', info='Silent duration between segments [for large text]'
+                 )
+
+         with gr.Column():
+             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+             with gr.Accordion('Enable Autoplay', open=False):
+                 autoplay = gr.Checkbox(value=True, label='Autoplay')
+                 autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+
+     text.submit(
+         text_to_speech,
+         inputs=[text, model_name, voice, speed, trim, pad_between, remove_silence, minimum_silence],
+         outputs=[audio]
+     )
+     generate_btn.click(
+         text_to_speech,
+         inputs=[text, model_name, voice, speed, trim, pad_between, remove_silence, minimum_silence],
+         outputs=[audio]
+     )
+
+ def podcast_maker(text, remove_silence=False, minimum_silence=50, model_name="kokoro-v0_19.pth"):
+     global MODEL, device
+     update_model(model_name)
+     if not minimum_silence:
+         minimum_silence = 0.05
+     keep_silence = int(minimum_silence * 1000)
+     podcast_save_at = podcast(MODEL, device, text, remove_silence=remove_silence, minimum_silence=keep_silence)
+     return podcast_save_at
+
+
+ dummy_example = """{af} Hello, I'd like to order a sandwich please.
+ {af_sky} What do you mean you're out of bread?
+ {af_bella} I really wanted a sandwich though...
+ {af_nicole} You know what, darn you and your little shop!
+ {bm_george} I'll just go back home and cry now.
+ {am_adam} Why me?"""
+ with gr.Blocks() as demo2:
+     gr.Markdown(
+         """
+         # Multiple Speech-Type Generation
+         This section allows you to generate multiple speech types or multiple people's voices. Enter your text in the format shown below, and the system will generate speech using the appropriate type. If unspecified, the model will use the "af" voice.
+         Format:
+         {voice_name} your text here
+         """
+     )
+     with gr.Row():
+         gr.Markdown(
+             """
+             **Example Input:**
+             {af} Hello, I'd like to order a sandwich please.
+             {af_sky} What do you mean you're out of bread?
+             {af_bella} I really wanted a sandwich though...
+             {af_nicole} You know what, darn you and your little shop!
+             {bm_george} I'll just go back home and cry now.
+             {am_adam} Why me?!
+             """
+         )
+     with gr.Row():
+         with gr.Column():
+             text = gr.Textbox(
+                 label='Enter Text',
+                 lines=7,
+                 placeholder=dummy_example
+             )
+             with gr.Row():
+                 generate_btn = gr.Button('Generate', variant='primary')
+             with gr.Accordion('Audio Settings', open=False):
+                 remove_silence = gr.Checkbox(value=False, label='✂️ Remove Silence From TTS')
+                 minimum_silence = gr.Number(
+                     label="Keep Silence Up To (In seconds)",
+                     value=0.20
+                 )
+         with gr.Column():
+             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+             with gr.Accordion('Enable Autoplay', open=False):
+                 autoplay = gr.Checkbox(value=True, label='Autoplay')
+                 autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+
+     text.submit(
+         podcast_maker,
+         inputs=[text, remove_silence, minimum_silence],
+         outputs=[audio]
+     )
+     generate_btn.click(
+         podcast_maker,
+         inputs=[text, remove_silence, minimum_silence],
+         outputs=[audio]
+     )
+
+
+ import shutil
+ import os
+
+ # Ensure the output directory exists
+ output_dir = "./temp_audio"
+ os.makedirs(output_dir, exist_ok=True)
+
+
+ #@title Generate Audio File From Subtitle
+ # from tqdm.notebook import tqdm
+ from tqdm import tqdm
+ import subprocess
+ import json
+ import pysrt
+ import os
+ from pydub import AudioSegment
+ import shutil
+ import uuid
+ import re
+ import time
+
+ # os.chdir(install_path)
+
+ def your_tts(text, audio_path, actual_duration, speed=1.0):
+     global srt_voice_name
+     model_name = "kokoro-v0_19.pth"
+     tts_path = text_to_speech(text, model_name, voice_name=srt_voice_name, speed=speed)
+     print(tts_path)
+     tts_audio = AudioSegment.from_file(tts_path)
+     tts_duration = len(tts_audio)
+     if tts_duration > actual_duration:
+         # Regenerate at a higher speed so the clip fits the subtitle's time slot
+         speedup_factor = tts_duration / actual_duration
+         tts_path = text_to_speech(text, model_name, voice_name=srt_voice_name, speed=speedup_factor)
+         print(tts_path)
+     shutil.copy(tts_path, audio_path)
+
+
+ base_path = "."
+ import datetime
+ def get_current_time():
+     # Return the current time as a string in the format HH_MM_AM/PM
+     return datetime.datetime.now().strftime("%I_%M_%p")
+
+ def get_subtitle_Dub_path(srt_file_path, Language="en"):
+     file_name = os.path.splitext(os.path.basename(srt_file_path))[0]
+     if not os.path.exists(f"{base_path}/TTS_DUB"):
+         os.mkdir(f"{base_path}/TTS_DUB")
+     random_string = str(uuid.uuid4())[:6]
+     new_path = f"{base_path}/TTS_DUB/{file_name}_{Language}_{get_current_time()}_{random_string}.wav"
+     return new_path
+
+
+ def clean_srt(input_path):
+     file_name = os.path.basename(input_path)
+     output_folder = f"{base_path}/save_srt"
+     if not os.path.exists(output_folder):
+         os.mkdir(output_folder)
+     output_path = f"{output_folder}/{file_name}"
+
+     def clean_srt_line(text):
+         bad_list = ["[", "]", "♫", "\n"]
+         for i in bad_list:
+             text = text.replace(i, "")
+         return text.strip()
+
+     # Load the subtitle file
+     subs = pysrt.open(input_path)
+
+     # Write each subtitle back out with its text cleaned
+     with open(output_path, "w", encoding='utf-8') as file:
+         for sub in subs:
+             file.write(f"{sub.index}\n")
+             file.write(f"{sub.start} --> {sub.end}\n")
+             file.write(f"{clean_srt_line(sub.text)}\n")
+             file.write("\n")
+     # print(f"Clean SRT saved at: {output_path}")
+     return output_path
+
+
+ class SRTDubbing:
+     def __init__(self):
+         pass
+
+     @staticmethod
+     def text_to_speech_srt(text, audio_path, language, actual_duration):
+         tts_filename = "./cache/temp.wav"
+         your_tts(text, tts_filename, actual_duration, speed=1.0)
+         # Check the duration of the generated TTS audio
+         tts_audio = AudioSegment.from_file(tts_filename)
+         tts_duration = len(tts_audio)
+
+         if actual_duration == 0:
+             # If the actual duration is zero, use the original TTS audio without modifications
+             shutil.move(tts_filename, audio_path)
+             return
+         # If the TTS audio is longer than the actual duration, speed it up
+         if tts_duration > actual_duration:
+             speedup_factor = tts_duration / actual_duration
+             speedup_filename = "./cache/speedup_temp.wav"
+             # Use ffmpeg to change the audio speed
+             subprocess.run([
+                 "ffmpeg",
+                 "-i", tts_filename,
+                 "-filter:a", f"atempo={speedup_factor}",
+                 speedup_filename,
+                 "-y"
+             ], check=True)
+
+             # Replace the original TTS audio with the sped-up version
+             shutil.move(speedup_filename, audio_path)
+         elif tts_duration < actual_duration:
+             # If the TTS audio is shorter than the actual duration, pad it with silence
+             silence_gap = actual_duration - tts_duration
+             silence = AudioSegment.silent(duration=int(silence_gap))
+             new_audio = tts_audio + silence
+
+             # Save the new audio with the added silence
+             new_audio.export(audio_path, format="wav")
+         else:
+             # If the durations match exactly, use the original TTS audio
+             shutil.move(tts_filename, audio_path)
+
+     @staticmethod
+     def make_silence(pause_time, pause_save_path):
+         silence = AudioSegment.silent(duration=pause_time)
+         silence.export(pause_save_path, format="wav")
+         return pause_save_path
+
+     @staticmethod
+     def create_folder_for_srt(srt_file_path):
+         srt_base_name = os.path.splitext(os.path.basename(srt_file_path))[0]
+         random_uuid = str(uuid.uuid4())[:4]
+         dummy_folder_path = f"{base_path}/dummy"
+         if not os.path.exists(dummy_folder_path):
+             os.makedirs(dummy_folder_path)
+         folder_path = os.path.join(dummy_folder_path, f"{srt_base_name}_{random_uuid}")
+         os.makedirs(folder_path, exist_ok=True)
+         return folder_path
+
+     @staticmethod
+     def concatenate_audio_files(audio_paths, output_path):
+         concatenated_audio = AudioSegment.silent(duration=0)
+         for audio_path in audio_paths:
+             audio_segment = AudioSegment.from_file(audio_path)
+             concatenated_audio += audio_segment
+         concatenated_audio.export(output_path, format="wav")
+
+     def srt_to_dub(self, srt_file_path, dub_save_path, language='en'):
+         result = self.read_srt_file(srt_file_path)
+         new_folder_path = self.create_folder_for_srt(srt_file_path)
+         join_path = []
+         for i in tqdm(result):
+             text = i['text']
+             actual_duration = i['end_time'] - i['start_time']
+             pause_time = i['pause_time']
+             silent_path = f"{new_folder_path}/{i['previous_pause']}"
+             self.make_silence(pause_time, silent_path)
+             join_path.append(silent_path)
+             tts_path = f"{new_folder_path}/{i['audio_name']}"
+             self.text_to_speech_srt(text, tts_path, language, actual_duration)
+             join_path.append(tts_path)
+         self.concatenate_audio_files(join_path, dub_save_path)
+
+     @staticmethod
+     def convert_to_millisecond(time_str):
+         if isinstance(time_str, str):
+             hours, minutes, second_millisecond = time_str.split(':')
+             seconds, milliseconds = second_millisecond.split(",")
+
+             total_milliseconds = (
+                 int(hours) * 3600000 +
+                 int(minutes) * 60000 +
+                 int(seconds) * 1000 +
+                 int(milliseconds)
+             )
+
+             return total_milliseconds
+
+     @staticmethod
+     def read_srt_file(file_path):
+         entries = []
+         default_start = 0
+         previous_end_time = default_start
+         entry_number = 1
+         audio_name_template = "{}.wav"
+         previous_pause_template = "{}_before_pause.wav"
+
+         with open(file_path, 'r', encoding='utf-8') as file:
+             lines = file.readlines()
+             # print(lines)
+             for i in range(0, len(lines), 4):
+                 time_info = re.findall(r'(\d+:\d+:\d+,\d+) --> (\d+:\d+:\d+,\d+)', lines[i + 1])
+                 start_time = SRTDubbing.convert_to_millisecond(time_info[0][0])
+                 end_time = SRTDubbing.convert_to_millisecond(time_info[0][1])
+
+                 current_entry = {
+                     'entry_number': entry_number,
+                     'start_time': start_time,
+                     'end_time': end_time,
+                     'text': lines[i + 2].strip(),
+                     'pause_time': start_time - previous_end_time if entry_number != 1 else start_time - default_start,
+                     'audio_name': audio_name_template.format(entry_number),
+                     'previous_pause': previous_pause_template.format(entry_number),
+                 }
+
+                 entries.append(current_entry)
+                 previous_end_time = end_time
+                 entry_number += 1
+
+         with open("entries.json", "w") as file:
+             json.dump(entries, file, indent=4)
+         return entries
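+
+ # Note: read_srt_file assumes strict four-line SRT blocks (index, timing,
+ # one line of text, then a blank line), stepping through the file in
+ # strides of 4. For example:
+ #
+ #   1
+ #   00:00:01,000 --> 00:00:03,500
+ #   Hello, world.
+ #
+ # Multi-line cues would break this stride; clean_srt() above strips newlines
+ # inside each cue and writes cues back out in exactly this shape.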
+ srt_voice_name = "am_adam"
+ def srt_process(srt_file_path, voice_name, dest_language="en"):
+     global srt_voice_name
+     srt_voice_name = voice_name
+     srt_dubbing = SRTDubbing()
+     dub_save_path = get_subtitle_Dub_path(srt_file_path, dest_language)
+     srt_dubbing.srt_to_dub(srt_file_path, dub_save_path, dest_language)
+     return dub_save_path
+
+ # Example usage:
+ # srt_file_path = "./long.srt"
+ # dub_audio_path = srt_process(srt_file_path)
+ # print(f"Audio file saved at: {dub_audio_path}")
+
+
+ with gr.Blocks() as demo3:
+
+     gr.Markdown(
+         """
+         # Generate Audio File From Subtitle [Single Speaker Only]
+
+         To generate subtitles, you can use the [Whisper Turbo Subtitle](https://github.com/NeuralFalconYT/Whisper-Turbo-Subtitle) tool.
+
+         [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NeuralFalconYT/Whisper-Turbo-Subtitle/blob/main/Whisper_Turbo_Subtitle.ipynb)
+         """
+     )
+     with gr.Row():
+         with gr.Column():
+             srt_file = gr.File(label='Upload .srt Subtitle File Only')
+             with gr.Row():
+                 voice = gr.Dropdown(
+                     voice_list,
+                     value='af',
+                     allow_custom_value=False,
+                     label='Voice',
+                 )
+             with gr.Row():
+                 generate_btn_ = gr.Button('Generate', variant='primary')
+
+         with gr.Column():
+             audio = gr.Audio(interactive=False, label='Output Audio', autoplay=True)
+             with gr.Accordion('Enable Autoplay', open=False):
+                 autoplay = gr.Checkbox(value=True, label='Autoplay')
+                 autoplay.change(toggle_autoplay, inputs=[autoplay], outputs=[audio])
+
+     # srt_file.submit(
+     #     srt_process,
+     #     inputs=[srt_file, voice],
+     #     outputs=[audio]
+     # )
+     generate_btn_.click(
+         srt_process,
+         inputs=[srt_file, voice],
+         outputs=[audio]
+     )
+
+
+ display_text = " \n".join(voice_list)
+
+ with gr.Blocks() as demo4:
+     gr.Markdown(f"# Voice Names \n{display_text}")
+
+
+ import click
+ @click.command()
+ @click.option("--debug", is_flag=True, default=False, help="Enable debug mode.")
+ @click.option("--share", is_flag=True, default=False, help="Enable sharing of the interface.")
+ def main(debug, share):
+     demo = gr.TabbedInterface([demo1, demo2, demo3, demo4], ["Batched TTS", "Multiple Speech-Type Generation", "SRT Dubbing", "Available Voice Names"], title="Kokoro TTS")
+
+     demo.queue().launch(debug=debug, share=share)
+     # Run on a local network instead:
+     # laptop_ip = "192.168.0.30"
+     # port = 8080
+     # demo.queue().launch(debug=debug, share=share, server_name=laptop_ip, server_port=port)
+
+ if __name__ == "__main__":
+     main()
+
+
+ ## For the client side
+ # from gradio_client import Client
+ # import shutil
+ # import os
+ # os.makedirs("temp_audio", exist_ok=True)
+ # client = Client("http://127.0.0.1:7860/")
+ # result = client.predict(
+ #     text="Hello!!",
+ #     model_name="kokoro-v0_19.pth",
+ #     voice_name="af_bella",
+ #     speed=1,
+ #     trim=0,
+ #     pad_between_segments=0,
+ #     remove_silence=False,
+ #     minimum_silence=0.05,
+ #     api_name="/text_to_speech"
+ # )
+ # save_at = f"./temp_audio/{os.path.basename(result)}"
+ # shutil.move(result, save_at)
+ # print(f"Saved at {save_at}")