import os import cv2 import numpy as np from PIL import Image, ImageDraw, ImageFont import json from paddleocr import PaddleOCR from transformers import pipeline import gradio as gr # Initialize PaddleOCR ocr = PaddleOCR(use_angle_cls=True, lang='en') # Predefined fields for extraction FIELDS = ["Scheme Name", "Folio Number", "Number of Units", "PAN", "Signature", "Tax Status", "Mobile Number", "Email", "Address", "Bank Account Details"] def draw_boxes_on_image(image, data): """ Draw bounding boxes and text on the image. Args: image (PIL Image): The input image. data (list): OCR results containing bounding boxes and detected text. Returns: PIL Image: The image with drawn boxes. """ draw = ImageDraw.Draw(image) try: font = ImageFont.truetype("arial.ttf", 20) except IOError: font = ImageFont.load_default() for item_id, item in enumerate(data, start=1): bounding_box, (text, confidence) = item box = np.array(bounding_box).astype(int) draw.line([tuple(box[0]), tuple(box[1])], fill="green", width=2) draw.line([tuple(box[1]), tuple(box[2])], fill="green", width=2) draw.line([tuple(box[2]), tuple(box[3])], fill="green", width=2) draw.line([tuple(box[3]), tuple(box[0])], fill="green", width=2) text_position = (box[0][0], box[0][1] - 20) draw.text(text_position, f"{item_id}: {text} ({confidence:.2f})", fill="red", font=font) return image def convert_to_json(results): """ Converts the OCR results into a JSON object with bounding box IDs. Args: results (list): The list of OCR results containing bounding box coordinates, text, and confidence. Returns: dict: JSON data with bounding boxes and text. """ json_data = [] for item_id, result in enumerate(results, start=1): bounding_box = result[0] text = result[1][0] confidence = result[1][1] json_data.append({ "id": item_id, "bounding_box": bounding_box, "text": text, "confidence": confidence }) return json_data def extract_field_value_pairs(text): """ Extract field-value pairs from the text using a pre-trained NLP model. Args: text (str): The text to be processed. Returns: dict: A dictionary with field-value pairs. """ nlp = pipeline("ner", model="mrm8488/bert-tiny-finetuned-sms-spam-detection") ner_results = [] chunk_size = 256 for i in range(0, len(text), chunk_size): chunk = text[i:i+chunk_size] ner_results.extend(nlp(chunk)) field_value_pairs = {} current_field = None for entity in ner_results: word = entity['word'] for field in FIELDS: if field.lower() in word.lower(): current_field = field break if current_field and entity['entity'] == "LABEL_1": field_value_pairs[current_field] = word return field_value_pairs def process_image(image): """ Process the uploaded image and perform OCR. Args: image (PIL Image): The input image. Returns: tuple: The image with bounding boxes, OCR results in JSON format, and field-value pairs. """ # Perform OCR on the image ocr_results = ocr.ocr(np.array(image), cls=True) # Draw boxes on the image image_with_boxes = draw_boxes_on_image(image.copy(), ocr_results[0]) # Convert OCR results to JSON json_results = convert_to_json(ocr_results[0]) json_results_path = 'ocr_results.json' # Save in the root directory with open(json_results_path, "w") as f: json.dump(json_results, f, indent=4) # Extract field-value pairs from the text text = " ".join([result[1][0] for result in ocr_results[0]]) field_value_pairs = extract_field_value_pairs(text) field_value_pairs_path = 'extracted_fields.json' # Save in the root directory with open(field_value_pairs_path, "w") as f: json.dump(field_value_pairs, f, indent=4) return image_with_boxes, json_results_path, field_value_pairs_path # Define Gradio interface iface = gr.Interface( fn=process_image, inputs=gr.Image(type="pil"), outputs=[ gr.Image(type="pil"), gr.File(label="Download OCR Results"), gr.File(label="Download Extracted Fields") ], live=True ) if __name__ == "__main__": iface.launch()