File size: 7,334 Bytes
77eabea 6ec69c0 7f5b6cf 6ec69c0 7f5b6cf 23f3f75 7f5b6cf 310b1cd 7f5b6cf 23f3f75 88b4f72 24b7a14 23f3f75 88b4f72 23f3f75 6ec69c0 7f5b6cf 6ec69c0 7f5b6cf 23f3f75 7f5b6cf 51c9037 7f5b6cf 77eabea 0a52a3b 7a3e6dc dd328cf 7a3e6dc 0a52a3b 6ec69c0 23f3f75 a496e5c 88b4f72 23f3f75 6ec69c0 23f3f75 6ec69c0 23f3f75 7f5b6cf 23f3f75 88b4f72 23f3f75 7f5b6cf 23f3f75 6ec69c0 0a52a3b 23f3f75 544cb68 23f3f75 310b1cd 0a52a3b 7f5b6cf 544cb68 6ec69c0 310b1cd 0a52a3b 347bb89 310b1cd 8a94ca8 23f3f75 310b1cd 8a94ca8 6ec69c0 7a3e6dc b764f80 7a3e6dc 545b263 7a3e6dc 545b263 7a3e6dc 545b263 1df68cf 545b263 7a3e6dc 310b1cd 6ec69c0 b764f80 6ec69c0 a94ede6 544cb68 0a52a3b 6ec69c0 7f5b6cf 6ec69c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 |
import spaces
import numpy as np
import gradio as gr
import torch
from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer, AutoFeatureExtractor
from parler_tts import ParlerTTSForConditionalGeneration
from PyPDF2 import PdfReader
import re
import textwrap
import soundfile as sf
# Device configuration: use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Initialize models and tokenizers once at import time; the TTS helper below
# reads these module-level objects on every call.
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
# NOTE(review): the feature extractor is loaded from the "mini" checkpoint while
# the model and tokenizer come from "large". It is only used for its sampling
# rate here; presumably both checkpoints share that value — confirm.
feature_extractor = AutoFeatureExtractor.from_pretrained("parler-tts/parler-tts-mini-v1")
# Sampling rate returned alongside every generated waveform.
SAMPLE_RATE = feature_extractor.sampling_rate
# Seed passed to torch.manual_seed before each generation call.
SEED = 42
# Helper function to extract text from a PDF
def pdf_to_text(pdf_file):
    """Extract the text of every page of a PDF file.

    Args:
        pdf_file: Path of the PDF file to read.

    Returns:
        The text of all pages that yielded any text, joined with newlines.
        An empty string is returned when no page yields text.
    """
    with open(pdf_file, 'rb') as file:
        pdf_reader = PdfReader(file)
        # extract_text() may return nothing for a page; treat that as empty.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    # Join with a newline so the last word of one page is not fused with the
    # first word of the next page.
    return "\n".join(page_text for page_text in page_texts if page_text)
# Helper function to split text into sentences using regex
# A sentence boundary is whitespace that directly follows '.', '!' or '?'.
# Requiring the whitespace keeps tokens such as "3.14" in one piece.
_SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')


def split_text_into_sentences(text):
    """Split text into sentences, keeping their terminating punctuation.

    Args:
        text: The text to split. ``None`` or an empty string is accepted.

    Returns:
        A list of stripped sentences in their original order. Fragments that
        contain no letter or digit (for example a stray "...") are dropped.
    """
    if not text:
        return []
    fragments = (fragment.strip() for fragment in _SENTENCE_BOUNDARY.split(text))
    # Skip empty or punctuation-only fragments: they carry nothing to speak.
    return [
        fragment
        for fragment in fragments
        if any(char.isalnum() for char in fragment)
    ]
# Translation function
@spaces.GPU(duration=120)
def translate(source_text, source_lang, target_lang, batch_size=16):
    """Translate text with a Helsinki-NLP Marian translation model.

    The text is wrapped into chunks of at most 512 characters, the chunks are
    translated in batches, and the translated chunks are joined with spaces.

    Args:
        source_text: Text to translate.
        source_lang: Source language code used to select the checkpoint.
        target_lang: Target language code used to select the checkpoint.
        batch_size: Number of chunks passed to the model per generate call.

    Returns:
        The translated text, or an empty string when there is nothing to
        translate.
    """
    text_chunks = textwrap.wrap(source_text or "", 512)
    if not text_chunks:
        # Nothing to translate: skip loading the model entirely.
        return ""

    if source_lang == 'en' and target_lang == 'tr':
        # English -> Turkish uses the dedicated "tc-big" checkpoint.
        model_name = "Helsinki-NLP/opus-mt-tc-big-en-tr"
    else:
        model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

    tokenizer = MarianTokenizer.from_pretrained(model_name)
    model = MarianMTModel.from_pretrained(model_name).to(device)

    translated_chunks = []
    for start in range(0, len(text_chunks), batch_size):
        text_batch = text_chunks[start:start + batch_size]
        encoded = tokenizer(
            text_batch,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        # Pass the attention mask explicitly so the padding added to shorter
        # chunks of the batch is masked out during generation.
        output_ids = model.generate(
            input_ids=encoded.input_ids,
            attention_mask=encoded.attention_mask,
            max_new_tokens=512,
        )
        translated_chunks.extend(
            tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        )
    return " ".join(translated_chunks)
# Function to combine audio arrays
def combine_audio_arrays(audio_list):
    """Concatenate audio segments end to end along the first axis.

    Args:
        audio_list: Sequence of NumPy arrays, one per audio segment.

    Returns:
        One array holding the segments in order. An empty ``float32`` array
        is returned when ``audio_list`` is empty.
    """
    if len(audio_list) == 0:
        # np.concatenate raises on an empty sequence; return silence instead.
        return np.zeros(0, dtype=np.float32)
    # A squeezed single-sample segment is 0-d, which np.concatenate rejects;
    # atleast_1d leaves arrays of one or more dimensions unchanged.
    segments = [np.atleast_1d(segment) for segment in audio_list]
    return np.concatenate(segments, axis=0)
# Function to generate audio for a single sentence
@spaces.GPU(duration=35)
def generate_single_wav_from_text(sentence, description):
    """Synthesize one sentence with the module-level TTS model.

    Args:
        sentence: Text to speak; tokenized and passed as the prompt.
        description: Voice description; stripped, tokenized and passed as the
            conditioning input.

    Returns:
        A ``(SAMPLE_RATE, audio)`` tuple where ``audio`` is the generated
        output moved to the CPU, converted to NumPy and squeezed.
    """
    # Re-seed before every call so sampling starts from the same RNG state.
    torch.manual_seed(SEED)

    description_tokens = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
    sentence_tokens = tts_tokenizer(sentence, return_tensors="pt").to(device)

    generate_kwargs = dict(
        input_ids=description_tokens.input_ids,
        prompt_input_ids=sentence_tokens.input_ids,
        attention_mask=description_tokens.attention_mask,
        prompt_attention_mask=sentence_tokens.attention_mask,
        do_sample=True,
        temperature=1.0,
    )
    waveform = tts_model.generate(**generate_kwargs)
    return SAMPLE_RATE, waveform.cpu().numpy().squeeze()
# Gradio Interface
# Target languages offered for each source language. Shared by the initial
# dropdown state and by update_target_lang so the two always agree.
TARGET_LANGS_BY_SOURCE = {
    "en": ["de", "fr", "tr"],
    "tr": ["en"],
    "de": ["en", "fr"],
    "fr": ["en", "de"],
}

with gr.Blocks() as demo:
    with gr.Row():
        with gr.Column():
            input_mode = gr.Radio(choices=["Upload PDF", "Type Text"], label="Input Mode", value="Type Text")
            # File extensions are given with a leading dot.
            pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'], visible=False)
            text_input = gr.Textbox(label="Type your text here", visible=True, placeholder="Enter text here if not uploading a PDF...")
            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
            # Start with the same choices update_target_lang offers for "en".
            target_lang = gr.Dropdown(choices=TARGET_LANGS_BY_SOURCE["en"], label="Target Language", value="tr", interactive=True)
            description = gr.Textbox(label="Voice Description", lines=2,
                                     value="Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise.")
            run_button = gr.Button("Generate Audio", variant="primary")
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio")
            markdown_output = gr.Markdown()

    def update_target_lang(source_lang):
        """Return a dropdown update listing the targets for ``source_lang``.

        The first available target becomes the selected value.
        """
        options = TARGET_LANGS_BY_SOURCE[source_lang]
        return gr.update(choices=options, value=options[0])

    def handle_input(input_mode, pdf_input, text_input):
        """Return the text to synthesize for the selected input mode.

        Args:
            input_mode: "Upload PDF" to read the uploaded file, anything else
                to use the typed text.
            pdf_input: Value of the file component, or ``None`` if no file
                was uploaded.
            text_input: Value of the text box.

        Raises:
            gr.Error: If PDF mode is selected but no file was uploaded.
        """
        if input_mode == "Upload PDF":
            if pdf_input is None:
                raise gr.Error("Please upload a PDF file first.")
            # NOTE(review): presumably the file component yields either a
            # path string or an object exposing the path as `.name`,
            # depending on the Gradio version — accept both.
            pdf_path = pdf_input if isinstance(pdf_input, str) else pdf_input.name
            return pdf_to_text(pdf_path)
        return text_input or ""

    def run_pipeline(input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description):
        """Turn the input into speech sentence by sentence, streaming results.

        After each synthesized sentence this yields the audio generated so
        far together with a Markdown transcript of the sentences spoken.

        Yields:
            ``((sample_rate, combined_audio), markdown_text)`` tuples.

        Raises:
            gr.Error: If the input contains no sentence to synthesize.
        """
        text = handle_input(input_mode, pdf_input, text_input)
        if translate_checkbox:
            text = translate(text, source_lang, target_lang)
        sentences = split_text_into_sentences(text)
        if not sentences:
            # Without this the generator would finish silently and leave the
            # outputs untouched.
            raise gr.Error("No text found to convert to speech.")

        all_audio = []
        transcript_parts = []
        for sentence in sentences:
            sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
            all_audio.append(audio_arr)
            transcript_parts.append(f"**Sentence**: {sentence}\n\n")
            # Each yield carries the cumulative audio, so it is recombined
            # after every sentence.
            yield (sample_rate, combine_audio_arrays(all_audio)), "".join(transcript_parts)

    examples = [
        [
            "Type Text",  # Example for text input mode
            None,  # No PDF
            "Once upon a time, in the depth of winter, when the flakes of snow fell like feathers from the clouds, a queen sat sewing at her palace window, which had a carved frame of black wood.",
            False,  # Translation not enabled
            "en",  # Source language
            "tr",  # Target language
            "In an inferior recording quality, a female speaker delivers her slightly expressive and animated words with a fast pace. There's a high level of background noise and a very distant-sounding reverberation. Her voice is slightly higher pitched than average."
        ],
        [
            "Upload PDF",  # Example for PDF input mode
            "Ethics.pdf",  # PDF name
            None,  # No direct text input
            False,  # Translation not enabled
            "en",  # Source language
            "tr",  # Target language
            "Gary's voice is monotone yet slightly fast in delivery, with a very close recording that has no background noise."
        ]
    ]

    # Inputs and outputs shared by the examples and the run button.
    pipeline_inputs = [input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description]
    pipeline_outputs = [audio_output, markdown_output]

    # Show only the widget that matches the selected input mode.
    input_mode.change(
        fn=lambda choice: [gr.update(visible=choice == "Upload PDF"), gr.update(visible=choice == "Type Text")],
        inputs=input_mode,
        outputs=[pdf_input, text_input],
    )
    gr.Examples(examples=examples, fn=run_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs, cache_examples=False)
    source_lang.change(update_target_lang, inputs=source_lang, outputs=target_lang)
    run_button.click(run_pipeline, inputs=pipeline_inputs, outputs=pipeline_outputs)

demo.launch(share=True)
|