from .kokoro import normalize_text, phonemize, generate
import re
import librosa
import os
import uuid
from pydub.silence import split_on_silence
from pydub import AudioSegment
import wave
import numpy as np
import torch


def create_audio_dir():
    """Creates the 'kokoro_audio' directory in the root folder if it doesn't exist."""
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    audio_dir = os.path.join(root_dir, "kokoro_audio")

    if not os.path.exists(audio_dir):
        os.makedirs(audio_dir)
        print(f"Created directory: {audio_dir}")
    else:
        print(f"Directory already exists: {audio_dir}")
    return audio_dir

temp_folder = create_audio_dir()


debug=False
def resplit_strings(arr):
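    """Split a list of words into two strings whose joined lengths are as balanced as possible."""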
    # Handle edge cases
    if not arr:
        return '', ''
    if len(arr) == 1:
        return arr[0], ''
    # Try each possible split point
    min_diff = float('inf')
    best_split = 0
    # Calculate lengths when joined with spaces
    lengths = [len(s) for s in arr]
    spaces = len(arr) - 1  # Total spaces needed
    # Try each split point
    left_len = 0
    right_len = sum(lengths) + spaces
    for i in range(1, len(arr)):
        # Add current word and space to left side
        left_len += lengths[i-1] + (1 if i > 1 else 0)
        # Remove current word and space from right side
        right_len -= lengths[i-1] + 1
        diff = abs(left_len - right_len)
        if diff < min_diff:
            min_diff = diff
            best_split = i
    # Join the strings with the best split point
    return ' '.join(arr[:best_split]), ' '.join(arr[best_split:])

def recursive_split(text, voice):
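    """Recursively split `text` until each piece phonemizes to fewer than 511 tokens,
    returning a list of (text, tokens, token_count) tuples."""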
    if not text:
        return []
    tokens = phonemize(text, voice, norm=False)
    if len(tokens) < 511:
        return [(text, tokens, len(tokens))] if tokens else []
    if ' ' not in text:
        return []
    for punctuation in ['!.?…', ':;', ',—']:
        splits = re.split(f'(?:(?<=[{punctuation}])|(?<=[{punctuation}]["\'»])|(?<=[{punctuation}]["\'»]["\'»])) ', text)
        if len(splits) > 1:
            break
        else:
            splits = None
    splits = splits or text.split(' ')
    a, b = resplit_strings(splits)
    return recursive_split(a, voice) + recursive_split(b, voice)

def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):    
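    """Normalize `text`, optionally drop [bracketed] spans, split it on runs of blank
    lines, and return (index, text, tokens, token_count) tuples for each segment."""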
    if skip_square_brackets:
        text = re.sub(r'\[.*?\]', '', text)
    texts = [t.strip() for t in re.split('\n{'+str(newline_split)+',}', normalize_text(text))] if newline_split > 0 else [normalize_text(text)]
    segments = [row for t in texts for row in recursive_split(t, voice)]
    return [(i, *row) for i, row in enumerate(segments)]


def large_text(text,VOICE_NAME):
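    """Return short text (<= 500 characters) as a single segment, otherwise segment it
    and return (index, text, length) tuples."""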
    if len(text) <= 500:
        return [(0, text, len(text))]
    else:
        result=segment_and_tokenize(text, VOICE_NAME[0])
        filtered_result = [(row[0], row[1], row[3]) for row in result]
        return filtered_result
    

def clamp_speed(speed):
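    """Clamp the speech speed: non-numeric values fall back to 1 and values above 2 are
    capped at 2; the lower bound is currently left unclamped (see the commented-out 0.5)."""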
    if not isinstance(speed, float) and not isinstance(speed, int):
        return 1
    elif speed < 0.5:
        # return 0.5
        return speed
    elif speed > 2:
        return 2
    return speed

def clamp_trim(trim):
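    """Clamp a 0..1 value: non-numeric or greater-than-1 values fall back to 0.5,
    non-positive values become 0."""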
    if not isinstance(trim, float) and not isinstance(trim, int):
        return 0.5
    elif trim <= 0:
        return 0
    elif trim > 1:
        return 0.5
    return trim

def trim_if_needed(out, trim):
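    """Shorten the leading/trailing silence of `out`; `trim` (0..1) is the fraction of the
    silence detected by librosa that gets removed."""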
    if not trim:
        return out
    a, b = librosa.effects.trim(out, top_db=30)[1]
    a = int(a*trim)
    b = int(len(out)-(len(out)-b)*trim)
    return out[a:b]    

# Above code copied from https://huggingface.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py

def get_random_file_name(output_file=""):
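    """Return a usable WAV path: a random name in the temp folder when none is given,
    the requested path if it is free or can be deleted, otherwise a random fallback name."""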
    global temp_folder
    if output_file=="":
        random_id = str(uuid.uuid4())[:8]
        output_file = f"{temp_folder}/{random_id}.wav"
        return output_file
    # If the requested file does not already exist, it can be used as-is
    if not os.path.exists(output_file):
        return output_file   
    try:
        if output_file and os.path.exists(output_file):
            os.remove(output_file)  # Try to remove the file if it exists
            return output_file      # Return the same name if the file was successfully removed
    except Exception as e:
        # print(f"Error removing file {output_file}: {e}")
        random_id = str(uuid.uuid4())[:8]
        output_file = f"{temp_folder}/{random_id}.wav"
        return output_file
    

def remove_silence_function(file_path,minimum_silence=50):
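    """Strip long pauses from a WAV file using pydub's split_on_silence and write the
    result to '<name>_no_silence.wav'; `minimum_silence` is the silence (ms) kept
    around each chunk."""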
    # Extract file name and format from the provided path
    output_path = file_path.replace(".wav", "_no_silence.wav")
    audio_format = "wav"
    # Reading and splitting the audio file into chunks
    sound = AudioSegment.from_file(file_path, format=audio_format)
    audio_chunks = split_on_silence(sound,
                                    min_silence_len=100,
                                    silence_thresh=-45,
                                    keep_silence=minimum_silence) 
    # Putting the file back together
    combined = AudioSegment.empty()
    for chunk in audio_chunks:
        combined += chunk
    combined.export(output_path, format=audio_format)
    return output_path

# import simpleaudio as sa
# def play_audio(filename):
#     wave_obj = sa.WaveObject.from_wave_file(filename)
#     play_obj = wave_obj.play()
#     play_obj.wait_done()


import re

def clean_text(text):
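    """Prepare text for TTS: replace characters that tend to be spoken or mispronounced
    (dashes, colons, asterisks, hashes), strip emojis, and collapse whitespace."""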
    # Define replacement rules
    replacements = {
        "–": " ",  # Replace en-dash with space
        "-": " ",  # Replace hyphen with space
        ":": ",",  # Replace colon with comma
        "**": " ", # Replace double asterisks with space
        "*": " ",  # Replace single asterisk with space
        "#": " ",  # Replace hash with space
    }

    # Apply replacements
    for old, new in replacements.items():
        text = text.replace(old, new)

    # Remove emojis using regex (covering wide range of Unicode characters)
    emoji_pattern = re.compile(
        r'[\U0001F600-\U0001F64F]|'  # Emoticons
        r'[\U0001F300-\U0001F5FF]|'  # Miscellaneous symbols and pictographs
        r'[\U0001F680-\U0001F6FF]|'  # Transport and map symbols
        r'[\U0001F700-\U0001F77F]|'  # Alchemical symbols
        r'[\U0001F780-\U0001F7FF]|'  # Geometric shapes extended
        r'[\U0001F800-\U0001F8FF]|'  # Supplemental arrows-C
        r'[\U0001F900-\U0001F9FF]|'  # Supplemental symbols and pictographs
        r'[\U0001FA00-\U0001FA6F]|'  # Chess symbols
        r'[\U0001FA70-\U0001FAFF]|'  # Symbols and pictographs extended-A
        r'[\U00002702-\U000027B0]|'  # Dingbats
        r'[\U0001F1E0-\U0001F1FF]'   # Flags (iOS)
        r'', flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)

    # Remove multiple spaces and extra line breaks
    text = re.sub(r'\s+', ' ', text).strip()

    return text

#copied from F5TTS 😁
import re
def parse_speechtypes_text(gen_text):
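    """Split multi-speaker text on {voice_name} markers into a list of
    {"voice_name": ..., "text": ...} segments; text before the first marker uses the
    default "af" voice."""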
    # Pattern to find {speechtype}
    pattern = r"\{(.*?)\}"

    # Split the text by the pattern
    tokens = re.split(pattern, gen_text)

    segments = []

    current_style = "af"

    for i in range(len(tokens)):
        if i % 2 == 0:
            # This is text
            text = tokens[i].strip()
            if text:
                text=clean_text(text)
                segments.append({"voice_name": current_style, "text": text})
        else:
            # This is style
            style = tokens[i].strip()
            current_style = style

    return segments

def podcast(MODEL, device, gen_text, speed=1.0, trim=0.5, pad_between_segments=0, remove_silence=True, minimum_silence=50):
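    """Generate a single WAV from multi-speaker text (see parse_speechtypes_text),
    loading the voice pack named by each segment and inserting `pad_between_segments`
    seconds of silence between segments. Returns the output file path."""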
    segments = parse_speechtypes_text(gen_text)
    speed = clamp_speed(speed)
    trim = clamp_trim(trim)
    silence_duration = clamp_trim(pad_between_segments)
    # output_file = get_random_file_name(output_file)
    sample_rate = 24000  # Sample rate of the audio

    # Create a silent audio segment in float32
    silence = np.zeros(int(sample_rate * silence_duration), dtype=np.float32)
    if len(segments)>=1:
        first_line_text=segments[0]["text"]
        output_file=tts_file_name(first_line_text)
    else:
        output_file = get_random_file_name("")
    
    output_file = output_file.replace('\n', '').replace('\r', '')
    # Open a WAV file for writing
    with wave.open(output_file, 'wb') as wav_file:
        wav_file.setnchannels(1)  # Mono
        wav_file.setsampwidth(2)  # 16-bit audio
        wav_file.setframerate(sample_rate)

        for idx, segment in enumerate(segments):  # idx is used to skip the pad after the last segment
            voice_name = segment["voice_name"]
            text = segment["text"]
            voice_pack_path = f"./KOKORO/voices/{voice_name}.pt"
            VOICEPACK = torch.load(voice_pack_path, weights_only=True).to(device)

            # Generate audio for the segment
            audio, out_ps = generate(MODEL, text, VOICEPACK, lang=voice_name[0], speed=speed)
            audio = trim_if_needed(audio, trim)

            # Scale audio from float32 to int16
            audio = (audio * 32767).astype(np.int16)

            # Write the audio segment to the WAV file
            wav_file.writeframes(audio.tobytes())

            # Add silence between segments, except after the last segment
            if idx != len(segments) - 1:
                wav_file.writeframes((silence * 32767).astype(np.int16).tobytes())

    # Optionally remove silence from the output file
    if remove_silence:
        output_file = remove_silence_function(output_file, minimum_silence=minimum_silence)

    return output_file
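
# Cache the most recently loaded voice pack so repeated tts() calls with the same
# voice can skip torch.load.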
old_voice_pack_path=""
old_VOICEPACK=None
def tts(MODEL,device,text, voice_name, speed=1.0, trim=0.5, pad_between_segments=0.5, output_file="",remove_silence=True,minimum_silence=50):
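    """Synthesize `text` with `voice_name` into a 24 kHz mono 16-bit WAV, splitting long
    inputs into segments and padding `pad_between_segments` seconds of silence between
    them. Returns the path of the generated (optionally silence-stripped) file."""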
    global old_voice_pack_path,old_VOICEPACK
    language = voice_name[0]
    voice_pack_path = f"./KOKORO/voices/{voice_name}.pt"
    if voice_name.endswith(".pt"):
        # A custom voice-pack path was passed directly; assume the "a" (American English) language code.
        language = "a"
        voice_pack_path = voice_name
    text=clean_text(text)
    segments = large_text(text, language)
    # Reload the voice pack only when the voice changes; "weighted_normalised_voices.pt" is
    # always reloaded (presumably because it can be regenerated between calls).
    if (old_voice_pack_path != voice_pack_path) or ("weighted_normalised_voices.pt" in voice_pack_path):
        VOICEPACK = torch.load(voice_pack_path, weights_only=True).to(device)
        old_voice_pack_path = voice_pack_path
        old_VOICEPACK = VOICEPACK
        # print("Loaded new voice pack")
    else:
        VOICEPACK = old_VOICEPACK
        # print("Using old voice pack")
    speed = clamp_speed(speed)
    trim = clamp_trim(trim)
    silence_duration = clamp_trim(pad_between_segments)
    output_file=get_random_file_name(output_file)
    if debug:
        print(f'Loaded voice: {voice_pack_path}')
        print(f"Speed: {speed}")
        print(f"Trim: {trim}")
        print(f"Silence duration: {silence_duration}")
    sample_rate = 24000  # Sample rate of the audio

    # Create a silent audio segment in float32
    silence = np.zeros(int(sample_rate * silence_duration), dtype=np.float32)

    # Open a WAV file for writing
    with wave.open(output_file, 'wb') as wav_file:
        wav_file.setnchannels(1)  # Mono
        wav_file.setsampwidth(2)  # 16-bit audio
        wav_file.setframerate(sample_rate)

        for segment in segments:
            seg_idx = segment[0]
            text = segment[1]
            if debug:
                print(segment)
            audio, out_ps = generate(MODEL, text, VOICEPACK, lang=language, speed=speed)
            audio = trim_if_needed(audio, trim)

            # Scale audio from float32 to int16
            audio = (audio * 32767).astype(np.int16)

            # Write the audio segment to the WAV file
            wav_file.writeframes(audio.tobytes())
            
            # Add silence between segments, except after the last segment
            if seg_idx != len(segments) - 1:
                wav_file.writeframes((silence * 32767).astype(np.int16).tobytes())
    if remove_silence:
        output_file=remove_silence_function(output_file,minimum_silence=minimum_silence)
    return output_file



def tts_file_name(text):
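    """Build a WAV path in the temp folder from the first words of `text` (lowercased,
    underscored, truncated to 25 characters) plus a random suffix for uniqueness."""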
    global temp_folder
    # Remove all non-alphabetic characters and convert to lowercase
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Retain only alphabets and spaces
    text = text.lower().strip()             # Convert to lowercase and strip leading/trailing spaces
    text = text.replace(" ", "_")           # Replace spaces with underscores
    
    # Truncate or handle empty text
    truncated_text = text[:25] if len(text) > 25 else text if len(text) > 0 else "empty"
    
    # Generate a random string for uniqueness
    random_string = uuid.uuid4().hex[:8].upper()
    
    # Construct the file name
    file_name = f"{temp_folder}/{truncated_text}_{random_string}.wav"
    return file_name
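

# Example usage (a sketch, not part of this module): the functions above expect an
# already-loaded Kokoro MODEL and a torch device, and how the model is built depends on
# the surrounding project, so `load_kokoro_model` below is a hypothetical placeholder.
#
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   MODEL = load_kokoro_model(device)  # hypothetical loader
#   wav_path = tts(MODEL, device, "Hello there!", "af", speed=1.0, trim=0.5)
#   episode = podcast(MODEL, device, "{af} Welcome back. {am_adam} Glad to be here.")  # voice names are examples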