import gradio as gr from PIL import Image import os from IndicPhotoOCR.ocr import OCR # Ensure OCR class is saved in a file named ocr.py from IndicPhotoOCR.theme import Seafoam from IndicPhotoOCR.utils.helper import detect_para from transformers import ( AutoModelForSeq2SeqLM, AutoTokenizer, ) import numpy as np import torch from collections import Counter def Most_Common(lst): data = Counter(lst) return data.most_common(1)[0][0] from IndicTransToolkit import IndicProcessor # Initialize the OCR object for text detection and recognition ocr = OCR(device='cpu',verbose=False) def translate(given_str,lang='hindi'): DEVICE = 'cpu' model_name = "ai4bharat/indictrans2-en-indic-1B" if lang=="english" else "ai4bharat/indictrans2-indic-en-1B" tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) model = AutoModelForSeq2SeqLM.from_pretrained(model_name, trust_remote_code=True) ip = IndicProcessor(inference=True) model = model.to(DEVICE) model.eval() src_lang, tgt_lang = ("eng_Latn", "hin_Deva") if lang=="english" else ("hin_Deva", "eng_Latn" ) batch = ip.preprocess_batch( [given_str], src_lang=src_lang, tgt_lang=tgt_lang, ) inputs = tokenizer( batch, truncation=True, padding="longest", return_tensors="pt", return_attention_mask=True, ).to(DEVICE) with torch.no_grad(): generated_tokens = model.generate( **inputs, use_cache=True, min_length=0, max_length=256, num_beams=5, num_return_sequences=1, ) # Decode the generated tokens into text with tokenizer.as_target_tokenizer(): generated_tokens = tokenizer.batch_decode( generated_tokens.detach().cpu().tolist(), skip_special_tokens=True, clean_up_tokenization_spaces=True, ) translation = ip.postprocess_batch(generated_tokens, lang=tgt_lang)[0] return translation def process_image(image): """ Processes the uploaded image for text detection and recognition. - Detects bounding boxes in the image - Draws bounding boxes on the image and identifies script in each detected area - Recognizes text in each cropped region and returns the annotated image and recognized text Parameters: image (PIL.Image): The input image to be processed. Returns: tuple: A PIL.Image with bounding boxes and a string of recognized text. """ # Save the input image temporarily image_path = "input_image.jpg" image.save(image_path) # Detect bounding boxes on the image using OCR detections = ocr.detect(image_path) # Draw bounding boxes on the image and save it as output ocr.visualize_detection(image_path, detections, save_path="output_image.png") # Load the annotated image with bounding boxes drawn output_image = Image.open("output_image.png") # Initialize list to hold recognized text from each detected area recognized_texts = {} pil_image = Image.open(image_path) # # Process each detected bounding box for script identification and text recognition # for bbox in detections: # # Identify the script and crop the image to this region # script_lang, cropped_path = ocr.crop_and_identify_script(pil_image, bbox) # if script_lang: # Only proceed if a script language is identified # # Recognize text in the cropped area # recognized_text = ocr.recognise(cropped_path, script_lang) # recognized_texts.append(recognized_text) langs = [] for id, bbox in enumerate(detections): # Identify the script and crop the image to this region script_lang, cropped_path = ocr.crop_and_identify_script(pil_image, bbox) # Calculate bounding box coordinates x1 = min([bbox[i][0] for i in range(len(bbox))]) y1 = min([bbox[i][1] for i in range(len(bbox))]) x2 = max([bbox[i][0] for i in range(len(bbox))]) y2 = max([bbox[i][1] for i in range(len(bbox))]) if script_lang: recognized_text = ocr.recognise(cropped_path, script_lang) recognized_texts[f"img_{id}"] = {"txt": recognized_text, "bbox": [x1, y1, x2, y2]} langs.append(script_lang) # Combine recognized texts into a single string for display # recognized_texts_combined = " ".join(recognized_texts) string = detect_para(recognized_texts) recognized_texts_combined = '\n'.join([' '.join(line) for line in string]) recognized_texts_combined = translate(recognized_texts_combined,Most_Common(langs)) return output_image, recognized_texts_combined # Custom HTML for interface header with logos and alignment interface_html = """
IITJ Logo
Bhashini Logo
""" # Links to GitHub and Dataset repositories with GitHub icon links_html = """
GitHub Repository Dataset Repository
""" # Custom CSS to style the text box font size custom_css = """ .custom-textbox textarea { font-size: 20px !important; } """ # Create an instance of the Seafoam theme for a consistent visual style seafoam = Seafoam() # Define examples for users to try out examples = [ ["test_images/208.jpg"], ["test_images/1310.jpg"] ] title = "

Developed by IITJ

" # Set up the Gradio Interface with the defined function and customizations demo = gr.Interface( fn=process_image, inputs=gr.Image(type="pil", image_mode="RGB"), outputs=[ gr.Image(type="pil", label="Detected Bounding Boxes"), gr.Textbox(label="Translated Text", elem_classes="custom-textbox") ], title="Scene Text Translator", description=title+interface_html+links_html, theme=seafoam, css=custom_css, examples=examples ) # # Server setup and launch configuration # if __name__ == "__main__": # server = "0.0.0.0" # IP address for server # port = 7867 # Port to run the server on # demo.launch(server_name=server, server_port=port) demo.launch()