initial commits
- app.py +144 -0
- requirements.txt +10 -0
app.py
ADDED
@@ -0,0 +1,144 @@
+import gradio as gr
+import torch
+from transformers import MarianTokenizer, MarianMTModel
+from pdf2docx import Converter
+from docx import Document
+from parler_tts import ParlerTTSForConditionalGeneration
+from transformers import AutoTokenizer
+import soundfile as sf
+from pydub import AudioSegment
+import os
+import nltk
+from PyPDF2 import PdfReader
+import textwrap
+
+# Download the punkt tokenizer for sentence splitting
+nltk.download('punkt')
+
+# Device configuration
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+# Translation function
+def translate(source_text, source_lang, target_lang, batch_size=16):
+    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
+
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    model = MarianMTModel.from_pretrained(model_name).to(device)
+
+    # Split the text into 512-character chunks and translate them in batches
+    text_chunks = textwrap.wrap(source_text, 512)
+    translated_text = ""
+
+    for i in range(0, len(text_chunks), batch_size):
+        text_batch = text_chunks[i:i + batch_size]
+        input_ids = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)
+        output_ids = model.generate(input_ids, max_new_tokens=512)
+
+        for output in output_ids:
+            output_text = tokenizer.decode(output, skip_special_tokens=True)
+            translated_text += output_text + " "
+
+    return translated_text
+
+# Function to extract text from PDF
+def pdf_to_text(pdf_path):
+    with open(pdf_path, 'rb') as file:
+        pdf_reader = PdfReader(file)
+        text = ""
+        for page_num in range(len(pdf_reader.pages)):
+            page = pdf_reader.pages[page_num]
+            text += page.extract_text()
+    return text
+
+# Load TTS model and tokenizer
+tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
+tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
+
+# Function to split text into sentences
+def split_text_into_sentences(text):
+    sentences = nltk.sent_tokenize(text)
+    return sentences
+
+# Function to generate audio from text
+def generate_wav_from_text(prompt, description, output_file_prefix):
+    input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)  # voice description conditions the speaker
+    prompt_input_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(device)  # text to be spoken
+
+    generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+    audio_arr = generation.cpu().numpy().squeeze()
+    output_file = f"{output_file_prefix}.wav"
+    sf.write(output_file, audio_arr, tts_model.config.sampling_rate)
+    return output_file
+
+# Function to combine audio files, separated by a short silence
+def combine_wav_files(output_file, *input_files, silence_duration=500):
+    combined = AudioSegment.empty()
+    silence = AudioSegment.silent(duration=silence_duration)
+
+    for file in input_files:
+        audio = AudioSegment.from_wav(file)
+        combined += audio + silence
+
+    combined.export(output_file, format='wav')
+
+# Function to update target language options based on the source language
+def update_target_lang_options(source_lang):
+    options = {
+        "en": ["de", "fr", "tr"],
+        "tr": ["en"],
+        "de": ["en", "fr"],
+        "fr": ["en", "de"]
+    }
+    return gr.update(choices=options.get(source_lang, []), value=options.get(source_lang, [])[0])
+
+# Main Gradio function
+def process_pdf(pdf_file, translate_checkbox, source_lang, target_lang, description):
+    text = pdf_to_text(pdf_file.name)
+
+    # Translate if the translation checkbox is selected
+    if translate_checkbox:
+        text = translate(text, source_lang, target_lang)
+
+    sentences = split_text_into_sentences(text)
+    audio_files = []
+    outputs = []
+
+    for i, sentence in enumerate(sentences):
+        output_file_prefix = f"sentence_{i + 1}"
+        audio_file = generate_wav_from_text(sentence, description, output_file_prefix)
+        audio_files.append(audio_file)
+        outputs.append((sentence, audio_file))
+
+    combined_output_file = "sentences_combined.wav"
+    combine_wav_files(combined_output_file, *audio_files)
+
+    return outputs, combined_output_file
+
+# Gradio interface
+with gr.Blocks() as demo:
+    with gr.Row():
+        with gr.Column(scale=1):
+            pdf_input = gr.File(label="Upload PDF", file_types=['.pdf'])
+            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
+            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
+            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
+            description = gr.Textbox(label="Voice Description",
+                                     value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
+            process_btn = gr.Button("Process")
+        with gr.Column(scale=2):
+            output = gr.Dataframe(headers=["Sentence", "Audio"], label="Generated Audio", datatype=["str", "str"])  # audio shown as file path
+            combined_audio = gr.Audio(label="Combined Audio with Silence", type="filepath")
+
+    def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
+        return process_pdf(pdf_input, translate_checkbox, source_lang, target_lang, description)
+
+    def handle_translation_toggle(translate_checkbox):
+        if translate_checkbox:
+            return gr.update(visible=True), gr.update(visible=True)
+        else:
+            return gr.update(visible=False), gr.update(visible=False)
+
+    translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
+    source_lang.change(fn=update_target_lang_options, inputs=source_lang, outputs=target_lang)
+    process_btn.click(handle_process, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[output, combined_audio])
+
+demo.launch()
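As a quick sanity check of the translation path above, the MarianMT model can be exercised outside Gradio. A minimal sketch, assuming the Helsinki-NLP/opus-mt-en-de checkpoint (the same opus-mt-{source}-{target} naming convention that translate() relies on):

import torch
from transformers import MarianTokenizer, MarianMTModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_name = "Helsinki-NLP/opus-mt-en-de"  # English -> German
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name).to(device)

# Tokenize a one-sentence batch and translate it
inputs = tokenizer(["How are you today?"], return_tensors="pt", padding=True).to(device)
output_ids = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))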
requirements.txt
ADDED
@@ -0,0 +1,10 @@
+gradio>=4.0.0
+torch>=1.9.0
+transformers>=4.11.3
+sentencepiece
+pdf2docx
+python-docx
+PyPDF2
+pydub
+soundfile
+nltk
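Note: app.py also imports parler_tts, which is not listed above. The Parler-TTS package is typically installed straight from its repository, e.g. pip install git+https://github.com/huggingface/parler-tts.git; that line can also be added to requirements.txt (an assumption about how this Space is meant to be set up, not something the commit itself specifies).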