from google.cloud import vision
from google.oauth2 import service_account
import pandas as pd
import json
import numpy as np
import io
import requests

from preprocess import cam_scanner_filter

image_ext = ("*.jpg", "*.jpeg", "*.png")


class VisionClient:
    """OCR client for printed text, backed by the Google Cloud Vision API."""

    def __init__(self, auth):
        credentials = service_account.Credentials.from_service_account_info(auth)
        self.client = vision.ImageAnnotatorClient(credentials=credentials)

    def send_request(self, image):
        try:
            image = vision.Image(content=image)
        except ValueError:
            print("Image could not be read")
            return None
        response = self.client.document_text_detection(image, timeout=60)
        return response

    def get_response(self, content):
        resp_js = None
        try:
            resp_js = self.send_request(content)
        except Exception as e:
            print("OCR request failed. Reason: {}".format(e))
        return resp_js

    def post_process(self, resp_js):
        boxObjects = []
        if resp_js is None:
            return boxObjects
        # Annotation 0 holds the full-page text, so start at 1 to iterate
        # over the individual words.
        for i in range(1, len(resp_js.text_annotations)):
            annotation = resp_js.text_annotations[i]
            vertices = annotation.bounding_poly.vertices
            # Vision sometimes reverses the left and right coordinates, which
            # would give a negative width and cause problems when drawing link
            # buttons, so normalise them here.
            leftX = min(vertices[1].x, vertices[3].x)
            rightX = max(vertices[1].x, vertices[3].x)
            boxObjects.append({
                "id": i - 1,
                "text": annotation.description,
                "left": leftX,
                "width": rightX - leftX,
                "top": vertices[1].y,
                "height": vertices[3].y - vertices[1].y,
            })
        return boxObjects

    def convert_to_df(self, boxObjects, image):
        ocr_df = pd.DataFrame(boxObjects)
        # ocr_df = ocr_df.sort_values(by=['top', 'left'], ascending=True).reset_index(drop=True)
        width, height = image.size
        # Rescale all box coordinates to a 1000x1000 reference frame.
        w_scale = 1000 / width
        h_scale = 1000 / height
        ocr_df = ocr_df.dropna() \
            .assign(left_scaled=ocr_df.left * w_scale,
                    width_scaled=ocr_df.width * w_scale,
                    top_scaled=ocr_df.top * h_scale,
                    height_scaled=ocr_df.height * h_scale,
                    right_scaled=lambda x: x.left_scaled + x.width_scaled,
                    bottom_scaled=lambda x: x.top_scaled + x.height_scaled)
        float_cols = ocr_df.select_dtypes('float').columns
        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
        # Drop rows whose text is empty or whitespace-only.
        ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
        ocr_df = ocr_df.dropna().reset_index(drop=True)
        return ocr_df

    def ocr(self, content, image):
        resp_js = self.get_response(content)
        boxObjects = self.post_process(resp_js)
        ocr_df = self.convert_to_df(boxObjects, image)
        return ocr_df


class TrOCRClient:
    """OCR client for handwritten text, backed by a TrOCR HTTP endpoint."""

    def __init__(self, api_url):
        self.api_url = api_url

    def convert_to_df(self, boxObjects, image):
        # Same post-processing as VisionClient.convert_to_df: rescale the box
        # coordinates to a 1000x1000 reference frame and drop empty rows.
        ocr_df = pd.DataFrame(boxObjects)
        # ocr_df = ocr_df.sort_values(by=['top', 'left'], ascending=True).reset_index(drop=True)
        width, height = image.size
        w_scale = 1000 / width
        h_scale = 1000 / height
        ocr_df = ocr_df.dropna() \
            .assign(left_scaled=ocr_df.left * w_scale,
                    width_scaled=ocr_df.width * w_scale,
                    top_scaled=ocr_df.top * h_scale,
                    height_scaled=ocr_df.height * h_scale,
                    right_scaled=lambda x: x.left_scaled + x.width_scaled,
                    bottom_scaled=lambda x: x.top_scaled + x.height_scaled)
        float_cols = ocr_df.select_dtypes('float').columns
        ocr_df[float_cols] = ocr_df[float_cols].round(0).astype(int)
        ocr_df = ocr_df.replace(r'^\s*$', np.nan, regex=True)
        ocr_df = ocr_df.dropna().reset_index(drop=True)
        return ocr_df

    def send_request(self, handwritten_img):
        # Encode the PIL image as JPEG bytes.
        jpeg_bytes = io.BytesIO()
        handwritten_img.save(jpeg_bytes, format='JPEG')
        jpeg_content = jpeg_bytes.getvalue()
        # Send a POST request with the image file
        response = requests.post(self.api_url, files={"file": jpeg_content})
        # Check the response status code
        if response.status_code == 200:
            # Get the extracted text from the response
            extracted_text = response.json()["ocr_result"]
        else:
            print(f"Error: {response.text}")
            extracted_text = None
        return extracted_text

    def ocr(self, handwritten_imgs, image):
        boxObjects = []
        # Each entry of handwritten_imgs is (crop, left, top, width, height).
        for i, handwritten_img in enumerate(handwritten_imgs):
            handwritten_img_processed = cam_scanner_filter(handwritten_img[0])
            ocr_result = self.send_request(handwritten_img_processed)
            boxObjects.append({
                "id": i,
                "text": ocr_result,
                "left": handwritten_img[1],
                "width": handwritten_img[3],
                "top": handwritten_img[2],
                "height": handwritten_img[4],
            })
        ocr_df = self.convert_to_df(boxObjects, image)
        return ocr_df
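

# --- Usage sketch (illustrative only) ---
# A minimal example of driving both clients, assuming a local service-account
# JSON file, PIL for image handling, and a running TrOCR endpoint. The
# credentials path, image path, and URL below are placeholders, not values
# defined by this module.
if __name__ == "__main__":
    from PIL import Image

    with open("service_account.json") as f:  # placeholder credentials path
        auth = json.load(f)

    image = Image.open("page.jpg")  # placeholder input image
    buf = io.BytesIO()
    image.save(buf, format="JPEG")
    content = buf.getvalue()

    # Printed text via Google Cloud Vision.
    vision_df = VisionClient(auth).ocr(content, image)
    print(vision_df.head())

    # Handwritten text via TrOCR; crops are (image, left, top, width, height).
    crops = [(image, 0, 0, image.width, image.height)]
    trocr_df = TrOCRClient("http://localhost:8000/ocr").ocr(crops, image)  # placeholder URL
    print(trocr_df.head())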