Update app.py
Browse files
app.py
CHANGED
@@ -14,19 +14,21 @@ from happytransformer import HappyTextToText, TTSettings
|
|
14 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,logging
|
15 |
from transformers.integrations import deepspeed
|
16 |
import re
|
|
|
17 |
import torch
|
18 |
-
|
19 |
-
|
20 |
-
T2TT_TARGET_LANGUAGE_NAMES,
|
21 |
-
TEXT_SOURCE_LANGUAGE_NAMES,
|
22 |
-
)
|
23 |
logging.set_verbosity_error()
|
24 |
|
25 |
DEFAULT_TARGET_LANGUAGE = "English"
|
26 |
-
|
27 |
-
|
28 |
-
|
29 |
-
|
|
|
|
|
|
|
|
|
30 |
|
31 |
|
32 |
import pytesseract as pt
|
@@ -174,22 +176,59 @@ def split_text_into_batches(text, max_tokens_per_batch):
|
|
174 |
@spaces.GPU(duration=60)
|
175 |
def run_t2tt(file_uploader , input_text: str, source_language: str, target_language: str) -> (str, bytes):
|
176 |
if file_uploader is not None:
|
177 |
-
with open(file_uploader,
|
178 |
-
input_text=file.read()
|
179 |
-
|
180 |
-
|
181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
182 |
batches = split_text_into_batches(input_text, max_tokens_per_batch)
|
183 |
translated_text = ""
|
|
|
184 |
for batch in batches:
|
185 |
-
|
186 |
-
|
187 |
-
|
188 |
-
|
189 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
190 |
_output_name = "result.txt"
|
191 |
-
open(_output_name,
|
192 |
-
|
|
|
|
|
193 |
|
194 |
with gr.Blocks() as demo_t2tt:
|
195 |
with gr.Row():
|
|
|
14 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,logging
|
15 |
from transformers.integrations import deepspeed
|
16 |
import re
|
17 |
+
from IndicTransToolkit import IndicProcessor
|
18 |
import torch
|
19 |
+
import torch
|
20 |
+
|
|
|
|
|
|
|
21 |
logging.set_verbosity_error()

DEFAULT_TARGET_LANGUAGE = "English"

# IndicTrans2 translation stack. Everything below is created once at import
# time and reused by run_t2tt for every request.
# NOTE(review): the checkpoint name says "indic-indic" while the default
# target language is English -- confirm this checkpoint accepts eng_Latn.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "ai4bharat/indictrans2-indic-indic-dist-320M"

# Pre/post-processor for the text going into and coming out of the model.
ip = IndicProcessor(inference=True)

# trust_remote_code=True is passed to both loaders, as in the model setup
# this file has always used for this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    trust_remote_code=True,
).to(DEVICE)
|
31 |
+
|
32 |
|
33 |
|
34 |
import pytesseract as pt
|
|
|
176 |
@spaces.GPU(duration=60)
def run_t2tt(file_uploader, input_text: str, source_language: str, target_language: str) -> (str, str):
    """Translate typed or uploaded text with IndicTrans2 and save the result.

    Args:
        file_uploader: Optional uploaded file, either a path string or an
            object exposing its path as ``.name``. When provided, the file is
            read as UTF-8 and its contents replace ``input_text``.
        input_text: Text to translate when no file is uploaded.
        source_language: Display name of the source language
            ("Hindi", "Punjabi" or "English").
        target_language: Display name of the target language
            (same choices as ``source_language``).

    Returns:
        A ``(translated_text, output_path)`` pair. The translation is also
        written, UTF-8 encoded, to ``output_path`` ("result.txt").

    Raises:
        KeyError: If either language name has no IndicTrans2 code mapping.
    """
    # Display name -> IndicTrans2 language tag.
    lang_code_map = {
        "Hindi": "hin_Deva",
        "Punjabi": "pan_Guru",
        "English": "eng_Latn",
    }
    try:
        src_lang = lang_code_map[source_language]
        tgt_lang = lang_code_map[target_language]
    except KeyError as err:
        supported = ", ".join(sorted(lang_code_map))
        raise KeyError(
            f"Unsupported language {err.args[0]!r}; supported languages: {supported}"
        ) from None

    if file_uploader is not None:
        # The upload may arrive as a plain path string or as a file wrapper
        # whose path is stored in ``.name``; accept both.
        upload_path = file_uploader if isinstance(file_uploader, str) else file_uploader.name
        with open(upload_path, "r", encoding="utf-8") as file:
            input_text = file.read()

    max_tokens_per_batch = 256
    # Nothing to translate: skip the splitter and the model entirely.
    batches = split_text_into_batches(input_text, max_tokens_per_batch) if input_text else []

    translated_parts = []
    for batch in batches:
        batch_preprocessed = ip.preprocess_batch([batch], src_lang=src_lang, tgt_lang=tgt_lang)
        inputs = tokenizer(
            batch_preprocessed,
            truncation=True,
            padding="longest",
            return_tensors="pt",
            return_attention_mask=True,
        ).to(DEVICE)

        # Inference only; no gradients are needed.
        with torch.no_grad():
            generated_tokens = model.generate(
                **inputs,
                use_cache=True,
                min_length=0,
                max_length=256,
                num_beams=5,
                num_return_sequences=1,
            )

        # Decode with the tokenizer switched to its target-side mode.
        with tokenizer.as_target_tokenizer():
            decoded_tokens = tokenizer.batch_decode(
                generated_tokens.detach().cpu().tolist(),
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True,
            )

        translations = ip.postprocess_batch(decoded_tokens, lang=tgt_lang)
        translated_parts.append(" ".join(translations))

    output = " ".join(translated_parts).strip()

    # NOTE(review): the output always goes to the same file name in the
    # working directory, so each call overwrites the previous result.
    _output_name = "result.txt"
    with open(_output_name, "w", encoding="utf-8") as out_file:
        out_file.write(output)

    return output, _output_name
|
232 |
|
233 |
with gr.Blocks() as demo_t2tt:
|
234 |
with gr.Row():
|