Spaces:

emirhanbilgic
/

read-my-pdf-outloud

Running

App Files Files Community

emirhanbilgic committed on Aug 11, 2024

Commit

0a52a3b

verified ·

1 Parent(s): c713231

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -9

app.py CHANGED Viewed

@@ -34,7 +34,29 @@ def split_text_into_sentences(text):
     sentences = sentence_endings.split(text)
     return [sentence.strip() for sentence in sentences if sentence.strip()]
-# Helper function to preprocess the text (normalization, punctuation)
 def preprocess(text):
     text = text.replace("-", " ")
     if text[-1] not in ".!?":
@@ -53,8 +75,6 @@ def generate_single_wav_from_text(sentence, description):
         prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
     )
     audio_arr = generation.cpu().numpy().squeeze()
-    output_file = f"sentence.wav"
-    sf.write(output_file, audio_arr, SAMPLE_RATE)
     return SAMPLE_RATE, audio_arr
 # Gradio Interface
@@ -62,28 +82,45 @@ with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
             description = gr.Textbox(label="Voice Description", lines=2,
                                      value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
             run_button = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Generated Audio")
-    def handle_process(pdf_input, description):
         # Extract and process text from PDF
         text = pdf_to_text(pdf_input.name)
         sentences = split_text_into_sentences(text)
         for sentence in sentences:
             # Generate audio for each sentence
             sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
-            yield gr.Audio.update(value=(sample_rate, audio_arr)), f"**Sentence**: {sentence}"
-    def run_pipeline(pdf_input, description):
         # Stream outputs to Gradio interface
-        for audio_component, markdown_component in handle_process(pdf_input, description):
-            yield audio_component, gr.Markdown(markdown_component)
-    run_button.click(run_pipeline, inputs=[pdf_input, description], outputs=[audio_output])
 demo.queue()
 demo.launch(share=True)

     sentences = sentence_endings.split(text)
     return [sentence.strip() for sentence in sentences if sentence.strip()]
+# Translation function: translates source_text with the Helsinki-NLP/opus-mt-{source_lang}-{target_lang} MarianMT checkpoint, batch_size chunks at a time, and returns the decoded chunks joined by spaces.
+@spaces.GPU(duration=120)
+def translate(source_text, source_lang, target_lang, batch_size=16):
+    model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
+    tokenizer = MarianTokenizer.from_pretrained(model_name)  # NOTE(review): tokenizer and model are re-loaded on every call
+    model = MarianMTModel.from_pretrained(model_name).to(device)
+    text_chunks = textwrap.wrap(source_text, 512)  # chunks are at most 512 characters wide (characters, not tokens)
+    translated_text = ""
+    for i in range(0, len(text_chunks), batch_size):
+        text_batch = text_chunks[i:i+batch_size]
+        input_ids = tokenizer(text_batch, return_tensors="pt", padding=True, truncation=True, max_length=512).input_ids.to(device)  # NOTE(review): only input_ids are kept; the attention mask of the padded batch is not passed to generate() - confirm this is intended
+        output_ids = model.generate(input_ids, max_new_tokens=512)
+        for output in output_ids:
+            output_text = tokenizer.decode(output, skip_special_tokens=True)
+            translated_text += output_text + " "  # each decoded chunk is followed by a space, so a non-empty result ends with a trailing space
+    return translated_text
+# Function to preprocess the text (normalization, punctuation)
 def preprocess(text):
     text = text.replace("-", " ")
     if text[-1] not in ".!?":
         prompt_attention_mask=prompt.attention_mask, do_sample=True, temperature=1.0
     )
     audio_arr = generation.cpu().numpy().squeeze()
     return SAMPLE_RATE, audio_arr
 # Gradio Interface
     with gr.Row():
         with gr.Column():
             pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
+            translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
+            source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
+            target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
             description = gr.Textbox(label="Voice Description", lines=2,
                                      value="Old man voice. Monotone voice tune from an old man, with a very close recording that almost has no background noise.")
             run_button = gr.Button("Generate Audio", variant="primary")
         with gr.Column():
             audio_output = gr.Audio(label="Generated Audio")
+            markdown_output = gr.Markdown()
+    def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):  # generator: yields ((sample_rate, audio_arr), markdown string) once per sentence
         # Extract and process text from PDF
         text = pdf_to_text(pdf_input.name)  # NOTE(review): reads pdf_input.name, so this presumably fails when no file was uploaded - confirm
+        # Perform translation if enabled (the translated text replaces the extracted text before sentence splitting)
+        if translate_checkbox:
+            text = translate(text, source_lang, target_lang)
         sentences = split_text_into_sentences(text)
         for sentence in sentences:
             # Generate audio for each sentence, using the same voice description for all of them
             sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
+            yield (sample_rate, audio_arr), f"**Sentence**: {sentence}"
+    def run_pipeline(pdf_input, translate_checkbox, source_lang, target_lang, description):  # click handler: forwards its arguments to handle_process unchanged
         # Stream outputs to Gradio interface: re-yield each (audio, markdown) pair as handle_process produces it
+        for audio_data, markdown_text in handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
+            yield audio_data, markdown_text
+    def handle_translation_toggle(translate_checkbox):  # returns two visibility updates, one per output component of the change event
+        if translate_checkbox:
+            return gr.update(visible=True), gr.update(visible=True)  # checkbox ticked: show both components
+        else:
+            return gr.update(visible=False), gr.update(visible=False)  # checkbox cleared: hide both components
+    translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
+    source_lang.change(fn=lambda lang: gr.update(choices={"en": ["de", "fr", "tr"], "tr": ["en"], "de": ["en", "fr"], "fr": ["en", "de"]}.get(lang, [])), inputs=source_lang, outputs=target_lang)
+    run_button.click(run_pipeline, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[audio_output, markdown_output])
 demo.queue()
 demo.launch(share=True)