# Source: Hugging Face Space NeuralFalcon/Kokoro-TTS (status: Running; file size: 12,749 bytes)

from .kokoro import normalize_text,phonemize,generate
import re
import librosa
import os
import uuid
from pydub.silence import split_on_silence
from pydub import AudioSegment
import wave
import numpy as np
import torch


def create_audio_dir():
    """Create the 'kokoro_audio' directory in the root folder if it doesn't exist.

    The root folder is the parent of the directory that contains this file.
    The directory is created directly and an already-existing directory is
    detected through ``FileExistsError``; this avoids the window between a
    separate existence check and the creation in which another process could
    create the same directory and make ``os.makedirs`` fail.

    Returns:
        The absolute path of the 'kokoro_audio' directory.
    """
    root_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    audio_dir = os.path.join(root_dir, "kokoro_audio")

    try:
        os.makedirs(audio_dir)
    except FileExistsError:
        print(f"Directory already exists: {audio_dir}")
    else:
        print(f"Created directory: {audio_dir}")
    return audio_dir

# Directory that receives generated .wav files; it is created (if missing) as a
# side effect of importing this module.
temp_folder = create_audio_dir()


# When True, tts() prints the voice pack path, speed, trim, silence duration
# and every text segment it synthesizes.
debug=False
def resplit_strings(arr):
    """Split a list of strings into two space-joined halves of similar length.

    Every cut position between two neighbouring items is tried, and the one
    that minimises the absolute difference between the joined lengths of the
    left and right parts is kept (the earliest such position wins on ties).

    Returns:
        A ``(left, right)`` tuple of strings. An empty list gives ``('', '')``
        and a single-item list gives ``(item, '')``.
    """
    if not arr:
        return '', ''
    if len(arr) == 1:
        return arr[0], ''

    word_lengths = [len(word) for word in arr]
    # Length of the whole list when joined with single spaces.
    joined_total = sum(word_lengths) + len(arr) - 1

    best_cut = 0
    smallest_gap = float('inf')
    # Starting at -1 lets every step add "word + separator" uniformly:
    # the first word then contributes exactly its own length.
    head_len = -1
    for cut, length in enumerate(word_lengths[:-1], start=1):
        head_len += length + 1
        # Whatever is not in the head (or the separating space) is the tail.
        tail_len = joined_total - head_len - 1
        gap = abs(head_len - tail_len)
        if gap < smallest_gap:
            smallest_gap, best_cut = gap, cut

    return ' '.join(arr[:best_cut]), ' '.join(arr[best_cut:])

def recursive_split(text, voice):
    """Recursively break text into pieces that phonemize to fewer than 511 tokens.

    The text is phonemized as a whole first. If the result is short enough it
    is returned as a single ``(text, tokens, token_count)`` row (or nothing
    when phonemization yields no tokens). Otherwise the text is cut at the
    strongest punctuation class that produces more than one piece -- sentence
    enders, then colons/semicolons, then commas/dashes -- falling back to
    plain spaces, the pieces are regrouped into two balanced halves and each
    half is processed the same way.

    Text that is too long but contains no space cannot be split and is
    dropped (an empty list is returned).
    """
    if not text:
        return []

    tokens = phonemize(text, voice, norm=False)
    token_count = len(tokens)
    if token_count < 511:
        if not tokens:
            return []
        return [(text, tokens, token_count)]

    if ' ' not in text:
        return []

    pieces = None
    for punctuation in ('!.?…', ':;', ',—'):
        # Split on a space that follows the punctuation, optionally with up to
        # two closing quote characters in between.
        candidate = re.split(f'(?:(?<=[{punctuation}])|(?<=[{punctuation}]["\'»])|(?<=[{punctuation}]["\'»]["\'»])) ', text)
        if len(candidate) > 1:
            pieces = candidate
            break
    if not pieces:
        pieces = text.split(' ')

    first_half, second_half = resplit_strings(pieces)
    return recursive_split(first_half, voice) + recursive_split(second_half, voice)

def segment_and_tokenize(text, voice, skip_square_brackets=True, newline_split=2):
    """Normalize text, cut it into paragraphs and split each into token-limited rows.

    Args:
        text: Raw input text.
        voice: Passed through to ``recursive_split`` for phonemization.
        skip_square_brackets: When true, ``[...]`` spans are removed first.
        newline_split: Minimum number of consecutive newlines that separates
            paragraphs; a value of 0 or less disables paragraph splitting.

    Returns:
        A list of tuples ``(index, *row)`` where ``row`` is what
        ``recursive_split`` produced and ``index`` counts rows from 0.
    """
    if skip_square_brackets:
        text = re.sub(r'\[.*?\]', '', text)

    normalized = normalize_text(text)
    if newline_split > 0:
        paragraph_break = '\n{' + str(newline_split) + ',}'
        texts = [part.strip() for part in re.split(paragraph_break, normalized)]
    else:
        texts = [normalized]

    rows = []
    for paragraph in texts:
        rows.extend(recursive_split(paragraph, voice))
    return [(index, *row) for index, row in enumerate(rows)]


def large_text(text,VOICE_NAME):
    """Return ``(index, text, size)`` rows for the text, splitting only long input.

    Text of at most 500 characters is returned as one row whose size is its
    character count. Longer text goes through ``segment_and_tokenize`` (using
    the first character of ``VOICE_NAME``), and each resulting row is reduced
    to its index, its text and its fourth field.
    """
    if len(text) > 500:
        rows = segment_and_tokenize(text, VOICE_NAME[0])
        return [(row[0], row[1], row[3]) for row in rows]
    return [(0, text, len(text))]
    

def clamp_speed(speed):
    """Sanitize a speed value: non-numbers become 1, values above 2 become 2.

    Note that there is no lower bound: values below 0.5 (including zero and
    negatives) are returned unchanged, as in the original code where the
    0.5 floor was commented out.
    """
    if not isinstance(speed, (float, int)):
        return 1
    if speed > 2:
        return 2
    return speed

def clamp_trim(trim):
    """Sanitize a trim factor so that it lies in [0, 1].

    Non-numeric input and values greater than 1 both fall back to 0.5;
    zero and negative values become 0; anything else is returned as is.
    """
    if not isinstance(trim, (float, int)):
        return 0.5
    if trim <= 0:
        return 0
    return 0.5 if trim > 1 else trim

def trim_if_needed(out, trim):
    """Cut part of the quiet lead-in and tail off an audio signal.

    ``librosa.effects.trim`` (30 dB threshold) reports where the non-silent
    region starts and ends; ``trim`` is the fraction of the leading and
    trailing quiet spans that is removed. A falsy ``trim`` returns ``out``
    untouched.
    """
    if not trim:
        return out
    start, end = librosa.effects.trim(out, top_db=30)[1]
    total = len(out)
    # Remove `trim` of the quiet span before `start` and after `end`.
    keep_from = int(start * trim)
    keep_to = int(total - (total - end) * trim)
    return out[keep_from:keep_to]

# The code above is copied from https://huggingface.co/spaces/hexgrad/Kokoro-TTS/blob/main/app.py

def get_random_file_name(output_file=""):
    """Return a path that is free to be written as the output .wav file.

    Args:
        output_file: Desired output path. An empty string requests a random
            name inside ``temp_folder``.

    Returns:
        * a random ``<temp_folder>/<8 chars>.wav`` path when ``output_file``
          is empty;
        * ``output_file`` itself when nothing exists at that path, or when the
          existing file could be deleted;
        * a random path in ``temp_folder`` when the existing file could not
          be deleted.

    The function always returns a string; previously a file that disappeared
    between the existence check and the removal made it return ``None``.
    """
    def _random_path():
        random_id = str(uuid.uuid4())[:8]
        return f"{temp_folder}/{random_id}.wav"

    if output_file == "":
        return _random_path()

    # Nothing at the requested path: it can be used as is.
    if not os.path.exists(output_file):
        return output_file

    try:
        os.remove(output_file)
    except FileNotFoundError:
        # Someone else removed it after the check above; the name is free.
        return output_file
    except OSError:
        # The file is locked, is a directory, or is not removable for another
        # reason: fall back to a fresh random name instead of failing.
        return _random_path()
    return output_file
    

def remove_silence_function(file_path,minimum_silence=50):
    """Write a copy of a WAV file with its silent stretches shortened.

    The audio is split wherever at least 100 ms stay below -45 dBFS, and the
    chunks are concatenated again, each keeping ``minimum_silence``
    milliseconds of the surrounding silence.

    Args:
        file_path: Path of the input WAV file.
        minimum_silence: Value passed to pydub as ``keep_silence``.

    Returns:
        The path of the written file: ``file_path`` with its trailing
        ``.wav`` replaced by ``_no_silence.wav``.
    """
    extension = ".wav"
    if file_path.endswith(extension):
        # Only rewrite the extension. A plain str.replace would also rewrite
        # ".wav" inside directory names and point at a nonexistent folder.
        output_path = file_path[:-len(extension)] + "_no_silence.wav"
    else:
        # Legacy behaviour for paths that do not end in ".wav".
        output_path = file_path.replace(".wav", "_no_silence.wav")
    audio_format = "wav"
    # Reading and splitting the audio file into chunks
    sound = AudioSegment.from_file(file_path, format=audio_format)
    audio_chunks = split_on_silence(sound,
                                    min_silence_len=100,
                                    silence_thresh=-45,
                                    keep_silence=minimum_silence)
    # Putting the file back together
    combined = AudioSegment.empty()
    for chunk in audio_chunks:
        combined += chunk
    combined.export(output_path, format=audio_format)
    return output_path

# import simpleaudio as sa
# def play_audio(filename):
#     wave_obj = sa.WaveObject.from_wave_file(filename)
#     play_obj = wave_obj.play()
#     play_obj.wait_done()


import re

def clean_text(text):
    """Strip markdown-like symbols and emojis from text and tidy its whitespace.

    Dashes, asterisks and hashes become spaces, colons become commas, a set
    of emoji/pictograph code point ranges is removed, and finally every run
    of whitespace (including line breaks) is collapsed into a single space
    with leading/trailing whitespace stripped.
    """
    # (old, new) pairs, applied in this order.
    substitutions = (
        ("–", " "),   # en-dash
        ("-", " "),   # hyphen
        (":", ","),   # colon
        ("**", " "),  # double asterisks
        ("*", " "),   # single asterisk
        ("#", " "),   # hash
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # Unicode ranges treated as emojis / pictographs.
    emoji_pattern = re.compile(
        r'[\U0001F600-\U0001F64F]|'  # Emoticons
        r'[\U0001F300-\U0001F5FF]|'  # Miscellaneous symbols and pictographs
        r'[\U0001F680-\U0001F6FF]|'  # Transport and map symbols
        r'[\U0001F700-\U0001F77F]|'  # Alchemical symbols
        r'[\U0001F780-\U0001F7FF]|'  # Geometric shapes extended
        r'[\U0001F800-\U0001F8FF]|'  # Supplemental arrows-C
        r'[\U0001F900-\U0001F9FF]|'  # Supplemental symbols and pictographs
        r'[\U0001FA00-\U0001FA6F]|'  # Chess symbols
        r'[\U0001FA70-\U0001FAFF]|'  # Symbols and pictographs extended-A
        r'[\U00002702-\U000027B0]|'  # Dingbats
        r'[\U0001F1E0-\U0001F1FF]'   # Flags (iOS)
        r'', flags=re.UNICODE)
    without_emojis = emoji_pattern.sub(r'', text)

    # Collapse repeated spaces and line breaks.
    single_spaced = re.sub(r'\s+', ' ', without_emojis)
    return single_spaced.strip()

#copied from F5TTS 😁
import re
def parse_speechtypes_text(gen_text):
    """Split a script with ``{voice}`` markers into per-voice text segments.

    Text that follows a ``{name}`` marker is attributed to that voice until
    the next marker; text before the first marker uses the voice ``"af"``.
    Each non-blank piece of text is passed through ``clean_text``.

    Returns:
        A list of ``{"voice_name": ..., "text": ...}`` dicts in script order.
    """
    # re.split with one capture group alternates: text, marker, text, marker, ...
    pieces = re.split(r"\{(.*?)\}", gen_text)

    active_voice = "af"
    segments = []
    for position, piece in enumerate(pieces):
        stripped = piece.strip()
        if position % 2:
            # Odd positions hold the captured marker contents.
            active_voice = stripped
            continue
        if stripped:
            segments.append({"voice_name": active_voice, "text": clean_text(stripped)})
    return segments

def podcast(MODEL, device, gen_text, speed=1.0, trim=0.5, pad_between_segments=0, remove_silence=True, minimum_silence=50):
    """Synthesize a multi-voice script into a single WAV file.

    ``gen_text`` is parsed by ``parse_speechtypes_text``: ``{voice}`` markers
    select the voice for the text that follows. Each segment is generated
    with the voice pack ``./KOKORO/voices/<voice>.pt`` and the language given
    by the first character of the voice name, then appended to a mono,
    16-bit, 24 kHz WAV file.

    Args:
        MODEL: Model object handed to ``generate``.
        device: Device the voice packs are moved to.
        gen_text: Script text with optional ``{voice}`` markers.
        speed: Speaking speed, sanitized by ``clamp_speed``.
        trim: Trim factor, sanitized by ``clamp_trim``.
        pad_between_segments: Seconds of silence between segments; it is
            sanitized by ``clamp_trim`` too, so values above 1 become 0.5.
        remove_silence: When true, the result is post-processed by
            ``remove_silence_function`` and that file's path is returned.
        minimum_silence: Forwarded to ``remove_silence_function``.

    Returns:
        Path of the written WAV file.
    """
    segments = parse_speechtypes_text(gen_text)
    speed = clamp_speed(speed)
    trim = clamp_trim(trim)
    silence_duration = clamp_trim(pad_between_segments)
    sample_rate = 24000  # Sample rate of the audio

    # Silence is all zeros, so its 16-bit encoding can be built once up front.
    silence_bytes = np.zeros(int(sample_rate * silence_duration), dtype=np.int16).tobytes()

    # Name the file after the first line of the script when there is one.
    if len(segments) >= 1:
        output_file = tts_file_name(segments[0]["text"])
    else:
        output_file = get_random_file_name("")
    output_file = output_file.replace('\n', '').replace('\r', '')

    # Voice packs already loaded during this call, keyed by voice name, so a
    # voice that speaks several times is read from disk only once.
    voicepacks = {}
    last_index = len(segments) - 1

    with wave.open(output_file, 'wb') as wav_file:
        wav_file.setnchannels(1)  # Mono
        wav_file.setsampwidth(2)  # 16-bit audio
        wav_file.setframerate(sample_rate)

        for idx, segment in enumerate(segments):
            voice_name = segment["voice_name"]
            if voice_name not in voicepacks:
                voice_pack_path = f"./KOKORO/voices/{voice_name}.pt"
                voicepacks[voice_name] = torch.load(voice_pack_path, weights_only=True).to(device)

            # Generate audio for the segment
            audio, _ = generate(MODEL, segment["text"], voicepacks[voice_name], lang=voice_name[0], speed=speed)
            audio = trim_if_needed(audio, trim)

            # Scale float audio to int16. Clipping first keeps samples outside
            # [-1, 1] from wrapping around in the integer conversion.
            audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)
            wav_file.writeframes(audio.tobytes())

            # Add silence between segments, except after the last segment
            if idx != last_index:
                wav_file.writeframes(silence_bytes)

    # Optionally remove silence from the output file
    if remove_silence:
        output_file = remove_silence_function(output_file, minimum_silence=minimum_silence)

    return output_file
# Module-level cache used by tts(): path of the most recently loaded voice
# pack and the loaded object itself, so that consecutive calls with the same
# voice do not reload it.
old_voice_pack_path=""
old_VOICEPACK=None
def tts(MODEL,device,text, voice_name, speed=1.0, trim=0.5, pad_between_segments=0.5, output_file="",remove_silence=True,minimum_silence=50):
    """Synthesize text with a single voice into a WAV file.

    The text is cleaned with ``clean_text``, split into segments by
    ``large_text``, and each segment is generated and appended to a mono,
    16-bit, 24 kHz WAV file with silence between consecutive segments.

    Args:
        MODEL: Model object handed to ``generate``.
        device: Device the voice pack is moved to.
        text: Text to speak.
        voice_name: Either a voice name (the pack is loaded from
            ``./KOKORO/voices/<voice_name>.pt`` and its first character is
            used as the language) or a path ending in ``.pt`` (loaded as is,
            with language ``"a"``).
        speed: Speaking speed, sanitized by ``clamp_speed``.
        trim: Trim factor, sanitized by ``clamp_trim``.
        pad_between_segments: Seconds of silence between segments; it is
            sanitized by ``clamp_trim`` too, so values above 1 become 0.5.
        output_file: Desired output path, resolved by
            ``get_random_file_name`` (empty string means a random name).
        remove_silence: When true, the result is post-processed by
            ``remove_silence_function`` and that file's path is returned.
        minimum_silence: Forwarded to ``remove_silence_function``.

    Returns:
        Path of the written WAV file.
    """
    global old_voice_pack_path,old_VOICEPACK
    language = voice_name[0]
    voice_pack_path = f"./KOKORO/voices/{voice_name}.pt"
    if voice_name.endswith(".pt"):
        language="a"
        voice_pack_path=voice_name
    text=clean_text(text)
    segments = large_text(text, language)

    # Reuse the previously loaded pack when the path is unchanged, except for
    # "weighted_normalised_voices.pt", which is always reloaded.
    if (old_voice_pack_path!=voice_pack_path) or ("weighted_normalised_voices.pt" in voice_pack_path):
        VOICEPACK = torch.load(voice_pack_path, weights_only=True).to(device)
        old_voice_pack_path=voice_pack_path
        old_VOICEPACK=VOICEPACK
    else:
        VOICEPACK=old_VOICEPACK

    speed = clamp_speed(speed)
    trim = clamp_trim(trim)
    silence_duration = clamp_trim(pad_between_segments)
    output_file=get_random_file_name(output_file)
    if debug:
        print(f'Loaded voice: {voice_pack_path}')
        print(f"Speed: {speed}")
        print(f"Trim: {trim}")
        print(f"Silence duration: {silence_duration}")
    sample_rate = 24000  # Sample rate of the audio

    # Silence is all zeros, so its 16-bit encoding can be built once up front.
    silence_bytes = np.zeros(int(sample_rate * silence_duration), dtype=np.int16).tobytes()
    last_index = len(segments) - 1

    # Open a WAV file for writing
    with wave.open(output_file, 'wb') as wav_file:
        wav_file.setnchannels(1)  # Mono
        wav_file.setsampwidth(2)  # 16-bit audio
        wav_file.setframerate(sample_rate)

        for index, segment in enumerate(segments):
            segment_text = segment[1]
            if debug:
                print(segment)
            audio, _ = generate(MODEL, segment_text, VOICEPACK, lang=language, speed=speed)
            audio = trim_if_needed(audio, trim)

            # Scale float audio to int16. Clipping first keeps samples outside
            # [-1, 1] from wrapping around in the integer conversion.
            audio = (np.clip(audio, -1.0, 1.0) * 32767).astype(np.int16)

            # Write the audio segment to the WAV file
            wav_file.writeframes(audio.tobytes())

            # Add silence between segments, except after the last segment
            if index != last_index:
                wav_file.writeframes(silence_bytes)
    if remove_silence:
        output_file=remove_silence_function(output_file,minimum_silence=minimum_silence)
    return output_file



def tts_file_name(text):
    """Build a readable, unique .wav path in ``temp_folder`` from a piece of text.

    Only ASCII letters of ``text`` are kept, lower-cased, with every run of
    whitespace (spaces, tabs, line breaks) turned into a single underscore so
    that no whitespace can end up in the file name. The slug is cut to 25
    characters ("empty" is used when nothing is left) and followed by eight
    random upper-case hex characters.

    Returns:
        A path of the form ``<temp_folder>/<slug>_<RANDOM>.wav``.
    """
    # Retain only alphabets and whitespace, then normalise case and edges.
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text).lower().strip()
    # Collapse any whitespace run, not just single spaces, into "_".
    slug = re.sub(r'\s+', '_', letters_only)

    # Truncate or handle empty text
    truncated_text = slug[:25] or "empty"

    # Generate a random string for uniqueness
    random_string = uuid.uuid4().hex[:8].upper()

    # Construct the file name
    return f"{temp_folder}/{truncated_text}_{random_string}.wav"