emirhanbilgic committed on
Commit
6ec69c0
·
verified ·
1 Parent(s): 224badc

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -58
app.py CHANGED
@@ -1,14 +1,12 @@
1
- import spaces
2
  import gradio as gr
3
  import torch
4
- from transformers import MarianTokenizer, MarianMTModel
5
  from parler_tts import ParlerTTSForConditionalGeneration
6
- from transformers import AutoTokenizer, AutoFeatureExtractor, set_seed
7
  from PyPDF2 import PdfReader
8
  import re
9
  import textwrap
10
  import soundfile as sf
11
- import numpy as np
12
 
13
  # Device configuration
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -21,13 +19,12 @@ SAMPLE_RATE = feature_extractor.sampling_rate
21
  SEED = 42
22
 
23
  # Helper function to extract text from a PDF
24
- def pdf_to_text(pdf_path):
25
- with open(pdf_path, 'rb') as file:
26
  pdf_reader = PdfReader(file)
27
  text = ""
28
- for page_num in range(len(pdf_reader.pages)):
29
- page = pdf_reader.pages[page_num]
30
- text += page.extract_text()
31
  return text
32
 
33
  # Helper function to split text into sentences using regex
@@ -37,10 +34,8 @@ def split_text_into_sentences(text):
37
  return [sentence.strip() for sentence in sentences if sentence.strip()]
38
 
39
  # Translation function
40
- @spaces.GPU(duration=120)
41
  def translate(source_text, source_lang, target_lang, batch_size=16):
42
  model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
43
-
44
  tokenizer = MarianTokenizer.from_pretrained(model_name)
45
  model = MarianMTModel.from_pretrained(model_name).to(device)
46
 
@@ -58,19 +53,16 @@ def translate(source_text, source_lang, target_lang, batch_size=16):
58
 
59
  return translated_text
60
 
61
- # Function to preprocess the text (normalization, punctuation)
62
- def preprocess(text):
63
- text = text.replace("-", " ")
64
- if text[-1] not in ".!?":
65
- text += "."
66
- return text
67
 
68
  # Function to generate audio for a single sentence
69
- @spaces.GPU(duration=120)
70
  def generate_single_wav_from_text(sentence, description):
71
- set_seed(SEED)
72
  inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
73
- prompt = tts_tokenizer(preprocess(sentence), return_tensors="pt").to(device)
74
 
75
  generation = tts_model.generate(
76
  input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask,
@@ -83,7 +75,9 @@ def generate_single_wav_from_text(sentence, description):
83
  with gr.Blocks() as demo:
84
  with gr.Row():
85
  with gr.Column():
86
- pdf_input = gr.File(label="Upload PDF", file_types=['pdf'])
 
 
87
  translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
88
  source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
89
  target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
@@ -94,55 +88,34 @@ with gr.Blocks() as demo:
94
  audio_output = gr.Audio(label="Generated Audio")
95
  markdown_output = gr.Markdown()
96
 
97
- # Helper function to combine audio arrays
98
- def combine_audio_arrays(audio_list):
99
- combined_audio = np.concatenate(audio_list, axis=0)
100
- return combined_audio
101
-
102
- # Adjust the handle_process function to accumulate and combine audio
103
- def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
104
- # Extract and process text from PDF
105
- print("Extracting text from PDF...")
106
- text = pdf_to_text(pdf_input.name)
107
- print(f"Extracted text: {text[:100]}...") # Display the first 100 characters for a quick preview
108
 
109
- # Perform translation if enabled
110
  if translate_checkbox:
111
- print("Translating text...")
112
  text = translate(text, source_lang, target_lang)
113
- print(f"Translated text: {text[:100]}...") # Display the first 100 characters for a quick preview
114
 
115
  sentences = split_text_into_sentences(text)
116
  all_audio = []
117
  all_text = ""
118
-
119
  for sentence in sentences:
120
- print(f"Processing sentence: {sentence[:50]}...") # Display the first 50 characters for a quick preview
121
  sample_rate, audio_arr = generate_single_wav_from_text(sentence, description)
122
  all_audio.append(audio_arr)
123
  combined_audio = combine_audio_arrays(all_audio)
124
  all_text += f"**Sentence**: {sentence}\n\n"
125
-
126
- # Yield the accumulated results
127
- yield sample_rate, combined_audio, all_text
128
-
129
- print("Processing complete.")
130
-
131
- # Update the Gradio interface pipeline function to handle combined audio
132
- def run_pipeline(pdf_input, translate_checkbox, source_lang, target_lang, description):
133
- # Stream outputs to Gradio interface
134
- for sample_rate, combined_audio, markdown_text in handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
135
- yield (sample_rate, combined_audio), markdown_text
136
 
137
- def handle_translation_toggle(translate_checkbox):
138
- if translate_checkbox:
139
- return gr.update(visible=True), gr.update(visible=True)
140
- else:
141
- return gr.update(visible=False), gr.update(visible=False)
142
 
143
- translate_checkbox.change(fn=handle_translation_toggle, inputs=translate_checkbox, outputs=[source_lang, target_lang])
144
- source_lang.change(fn=lambda lang: gr.update(choices={"en": ["de", "fr", "tr"], "tr": ["en"], "de": ["en", "fr"], "fr": ["en", "de"]}.get(lang, [])), inputs=source_lang, outputs=target_lang)
145
- run_button.click(run_pipeline, inputs=[pdf_input, translate_checkbox, source_lang, target_lang, description], outputs=[audio_output, markdown_output])
146
 
147
- demo.queue()
148
- demo.launch(share=True)
 
1
+ import numpy as np
2
  import gradio as gr
3
  import torch
4
+ from transformers import MarianTokenizer, MarianMTModel, AutoTokenizer, AutoFeatureExtractor
5
  from parler_tts import ParlerTTSForConditionalGeneration
 
6
  from PyPDF2 import PdfReader
7
  import re
8
  import textwrap
9
  import soundfile as sf
 
10
 
11
  # Device configuration
12
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
19
  SEED = 42
20
 
21
  # Helper function to extract text from a PDF
22
def pdf_to_text(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: Path of the PDF; it is opened in binary mode.

    Returns:
        The page texts joined in page order with no separator. A page whose
        ``extract_text()`` result is falsy contributes an empty string.
    """
    with open(pdf_file, "rb") as handle:
        reader = PdfReader(handle)
        # Extract while the file is still open; the reader pulls from the handle.
        page_texts = [page.extract_text() or "" for page in reader.pages]
    return "".join(page_texts)
29
 
30
  # Helper function to split text into sentences using regex
 
34
  return [sentence.strip() for sentence in sentences if sentence.strip()]
35
 
36
  # Translation function
 
37
  def translate(source_text, source_lang, target_lang, batch_size=16):
38
  model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"
 
39
  tokenizer = MarianTokenizer.from_pretrained(model_name)
40
  model = MarianMTModel.from_pretrained(model_name).to(device)
41
 
 
53
 
54
  return translated_text
55
 
56
+ # Function to combine audio arrays
57
def combine_audio_arrays(audio_list):
    """Concatenate audio chunks end to end along the first axis.

    Args:
        audio_list: Sequence of array-likes to join, in playback order.

    Returns:
        A single NumPy array holding all chunks back to back. An empty
        ``audio_list`` yields an empty one-dimensional array instead of the
        ``ValueError`` that ``np.concatenate`` raises when it is given
        nothing to join.
    """
    if len(audio_list) == 0:
        # NOTE(review): float32 is an assumption about the chunk dtype
        # produced by the TTS step -- confirm against the generator.
        return np.zeros(0, dtype=np.float32)
    return np.concatenate(audio_list, axis=0)
 
 
60
 
61
  # Function to generate audio for a single sentence
 
62
  def generate_single_wav_from_text(sentence, description):
63
+ torch.manual_seed(SEED)
64
  inputs = tts_tokenizer(description.strip(), return_tensors="pt").to(device)
65
+ prompt = tts_tokenizer(sentence, return_tensors="pt").to(device)
66
 
67
  generation = tts_model.generate(
68
  input_ids=inputs.input_ids, prompt_input_ids=prompt.input_ids, attention_mask=inputs.attention_mask,
 
75
  with gr.Blocks() as demo:
76
  with gr.Row():
77
  with gr.Column():
78
+ input_mode = gr.Radio(choices=["Upload PDF", "Type Text"], label="Input Mode", value="Type Text")
79
+ pdf_input = gr.File(label="Upload PDF", file_types=['pdf'], visible=False)
80
+ text_input = gr.Textbox(label="Type your text here", visible=True, placeholder="Enter text here if not uploading a PDF...")
81
  translate_checkbox = gr.Checkbox(label="Enable Translation", value=False)
82
  source_lang = gr.Dropdown(choices=["en", "tr", "de", "fr"], label="Source Language", value="en", interactive=True)
83
  target_lang = gr.Dropdown(choices=["tr"], label="Target Language", value="tr", interactive=True)
 
88
  audio_output = gr.Audio(label="Generated Audio")
89
  markdown_output = gr.Markdown()
90
 
91
def handle_input(input_mode, pdf_input, text_input):
    """Resolve the text to process from whichever input mode is selected.

    Args:
        input_mode: The selected mode; "Upload PDF" reads the uploaded file,
            any other value uses the typed text.
        pdf_input: The uploaded file: either an object exposing its path as
            ``.name`` or the path string itself. ``None`` when nothing was
            uploaded.
        text_input: The contents of the text box.

    Returns:
        The text extracted from the PDF in PDF mode, ``text_input`` unchanged
        in text mode, or an empty string when PDF mode is selected but no
        file was provided.
    """
    if input_mode != "Upload PDF":
        return text_input
    if pdf_input is None:
        # No file uploaded: return no text rather than failing on None.name.
        return ""
    # The upload may arrive as a plain path string or as a file wrapper.
    pdf_path = pdf_input if isinstance(pdf_input, str) else pdf_input.name
    return pdf_to_text(pdf_path)
96
+
97
def run_pipeline(input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description):
    """Stream cumulative audio and a running transcript, one sentence at a time.

    Args:
        input_mode: Selected input mode, forwarded to ``handle_input``.
        pdf_input: Uploaded PDF, forwarded to ``handle_input``.
        text_input: Typed text, forwarded to ``handle_input``.
        translate_checkbox: When truthy, the text is passed through
            ``translate`` before synthesis.
        source_lang: Source language code handed to ``translate``.
        target_lang: Target language code handed to ``translate``.
        description: Voice description handed to the per-sentence generator.

    Yields:
        ``((sample_rate, combined_audio), transcript)`` after each sentence,
        where ``combined_audio`` joins every chunk generated so far and
        ``transcript`` lists every sentence processed so far.
    """
    source_text = handle_input(input_mode, pdf_input, text_input)
    if translate_checkbox:
        source_text = translate(source_text, source_lang, target_lang)

    audio_chunks = []
    transcript = ""
    for sentence in split_text_into_sentences(source_text):
        sample_rate, chunk = generate_single_wav_from_text(sentence, description)
        audio_chunks.append(chunk)
        # Re-join everything so far so each update carries the full audio.
        combined_audio = combine_audio_arrays(audio_chunks)
        transcript += f"**Sentence**: {sentence}\n\n"
        yield (sample_rate, combined_audio), transcript
 
 
 
 
 
 
 
 
 
 
112
 
113
# Event listeners must be registered inside the Blocks context; re-entering
# it here keeps the wiring valid wherever the handler functions are defined.
with demo:
    # Show only the widget that matches the selected input mode.
    input_mode.change(
        fn=lambda choice: [
            gr.update(visible=choice == "Upload PDF"),
            gr.update(visible=choice == "Type Text"),
        ],
        inputs=input_mode,
        outputs=[pdf_input, text_input],
    )

    run_button.click(
        run_pipeline,
        inputs=[input_mode, pdf_input, text_input, translate_checkbox, source_lang, target_lang, description],
        outputs=[audio_output, markdown_output],
    )

# run_pipeline is a generator that streams partial results. Enabling the queue
# explicitly keeps streaming working on Gradio versions where it is not on by
# default.
demo.queue()
demo.launch(share=True)