kavg committed on
Commit 16f0a9b
Parent: 75c54a9

commit before changing entity merging process

Files changed (4)
  1. main.py +23 -13
  2. ocr.py +4 -4
  3. preprocess.py +9 -0
  4. token_classification.py +1 -1
main.py CHANGED
@@ -44,9 +44,13 @@ async def ProcessDocument(file: UploadFile):
         raise HTTPException(status_code=400, detail="Cannot apply OCR to the image")
     try:
         tokenClassificationOutput, img_size = LabelTokens(ocr_df, image)
-        reOutput = ExtractRelations(tokenClassificationOutput, ocr_df, img_size)
     except:
-        raise HTTPException(status_code=400, detail="Invalid Image")
+        raise HTTPException(status_code=400, detail="Entity identification failed")
+
+    try:
+        reOutput = ExtractRelations(tokenClassificationOutput, ocr_df, img_size)
+    except:
+        raise HTTPException(status_code=400, detail="Relation extraction failed")
     return reOutput
 
 @app.post("/submit-doc-base64")
@@ -78,22 +82,27 @@ def ApplyOCR(content):
     except:
         raise HTTPException(status_code=400, detail="Handwritting detection failed")
 
-    try:
-        trocr_client = ocr.TrOCRClient(config['settings'].TROCR_API_URL)
-        handwritten_ocr_df = trocr_client.ocr(handwritten_imgs, image)
-    except:
-        raise HTTPException(status_code=400, detail="handwritten OCR process failed")
-
     try:
         jpeg_bytes = io.BytesIO()
-        printed_img.save(jpeg_bytes, format='JPEG')
-        jpeg_content = jpeg_bytes.getvalue()
+        printed_img.save(jpeg_bytes, format='PNG')
+        # printed_img.save('temp/printed_text_image.jpeg', format='PNG')
+        printed_content = jpeg_bytes.getvalue()
         vision_client = ocr.VisionClient(config['settings'].GCV_AUTH)
-        printed_ocr_df = vision_client.ocr(jpeg_content, printed_img)
-    except:
+        printed_ocr_df = vision_client.ocr(printed_content, printed_img)
+        # printed_ocr_df.to_csv('temp/complete_image_ocr.csv', index=False)
+        # return printed_ocr_df, image
+    except Exception as e:
         raise HTTPException(status_code=400, detail="Printed OCR process failed")
 
+    try:
+        trocr_client = ocr.TrOCRClient(config['settings'].TROCR_API_URL)
+        handwritten_ocr_df = trocr_client.ocr(handwritten_imgs, image)
+    except Exception as e:
+        print(e)
+        raise HTTPException(status_code=400, detail="handwritten OCR process failed")
+
     ocr_df = pd.concat([handwritten_ocr_df, printed_ocr_df])
+    # ocr_df = printed_ocr_df
     return ocr_df, image
 
 
@@ -103,13 +112,14 @@ def LabelTokens(ocr_df, image):
     return {"token_labels": token_labels, "input_ids": input_ids, "bbox":bbox, "attention_mask":attention_mask}, image.size
 
 def ExtractRelations(tokenClassificationOutput, ocr_df, img_size):
+    print(tokenClassificationOutput)
     token_labels = tokenClassificationOutput['token_labels']
     input_ids = tokenClassificationOutput['input_ids']
     attention_mask = tokenClassificationOutput["attention_mask"]
     bbox_org = tokenClassificationOutput["bbox"]
 
     merged_output, merged_words = token_classification.createEntities(config['ser_model'], token_labels, input_ids, ocr_df, config['tokenizer'], img_size, bbox_org)
-
+
     entities = merged_output['entities']
     input_ids = torch.tensor([merged_output['input_ids']]).to(config['device'])
     bbox = torch.tensor([merged_output['bbox']]).to(config['device'])
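A note on the serialization change in ApplyOCR: the buffer is still named jpeg_bytes, but the printed-text image is now encoded as PNG; Google Cloud Vision accepts both formats, so only the variable name is stale. A minimal sketch of the in-memory round-trip, assuming a PIL image (the file name is hypothetical):

    import io
    from PIL import Image

    printed_img = Image.open("printed_page.png")  # hypothetical input
    buf = io.BytesIO()
    printed_img.save(buf, format="PNG")           # encode in memory, no temp file
    printed_content = buf.getvalue()              # raw bytes handed to the OCR client
    assert Image.open(io.BytesIO(printed_content)).format == "PNG"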
ocr.py CHANGED
@@ -1,12 +1,11 @@
 from google.cloud import vision
 from google.oauth2 import service_account
-from google.protobuf.json_format import MessageToJson
 import pandas as pd
 import json
 import numpy as np
-from PIL import Image
 import io
 import requests
+from preprocess import cam_scanner_filter
 
 image_ext = ("*.jpg", "*.jpeg", "*.png")
 
@@ -23,7 +22,7 @@ class VisionClient:
         except ValueError as e:
             print("Image could not be read")
             return
-        response = self.client.document_text_detection(image, timeout=10)
+        response = self.client.document_text_detection(image, timeout=60)
         return response
 
     def get_response(self, content):
@@ -134,7 +133,8 @@ class TrOCRClient():
         boxObjects = []
         for i in range(len(handwritten_imgs)):
             handwritten_img = handwritten_imgs[i]
-            ocr_result = self.send_request(handwritten_img[0])
+            handwritten_img_processed = cam_scanner_filter(handwritten_img[0])
+            ocr_result = self.send_request(handwritten_img_processed)
             boxObjects.append({
                 "id": i-1,
                 "text": ocr_result,
preprocess.py CHANGED
@@ -1,5 +1,8 @@
 import torch
 from transformers import AutoTokenizer
+import cv2
+from PIL import Image
+import numpy as np
 
 def normalize_box(box, width, height):
     return [
@@ -9,6 +12,12 @@ def normalize_box(box, width, height):
         int(1000 * (box[3] / height)),
     ]
 
+def cam_scanner_filter(img):
+    image1 = np.array(img)
+    img = cv2.cvtColor(image1, cv2.COLOR_BGR2GRAY)
+    thresh2 = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 199, 15)
+    return Image.fromarray(thresh2)
+
 # class to turn the keys of a dict into attributes (thanks Stackoverflow)
 class AttrDict(dict):
     def __init__(self, *args, **kwargs):
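The new cam_scanner_filter binarizes a crop with Gaussian adaptive thresholding (block size 199, offset 15) to sharpen handwriting before TrOCR. One caveat: np.array on a PIL image yields RGB data, so cv2.COLOR_RGB2GRAY would be the strictly matching flag; COLOR_BGR2GRAY still runs but weights the red and blue channels as if swapped. A usage sketch with a hypothetical file name:

    from PIL import Image
    from preprocess import cam_scanner_filter

    crop = Image.open("handwritten_crop.png").convert("RGB")  # hypothetical crop
    binarized = cam_scanner_filter(crop)  # PIL image of 0/255 pixels
    binarized.save("handwritten_crop_bw.png")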
token_classification.py CHANGED
@@ -198,7 +198,7 @@ def createEntities(model, predictions, input_ids, ocr_df, tokenizer, img_size, b
 
     merged_words = []
     for i,merged_tagging in enumerate(merged_taggings):
-        if len(merged_tagging) > 1:
+        if ((len(merged_tagging) > 1) or (merged_tagging['label']=='ANSWER')):
             new_word = {}
             merging_word = " ".join([word['text'] for word in merged_tagging])
             merging_box = [merged_tagging[0]['box'][0]-5,merged_tagging[0]['box'][1]-10,merged_tagging[-1]['box'][2]+5,merged_tagging[-1]['box'][3]+10]
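A caveat on the widened condition: merged_tagging is indexed as a list of word dicts elsewhere in this hunk (merged_tagging[0]['box'], merged_tagging[-1]['box']), so merged_tagging['label'] raises TypeError exactly when the or-branch is reached (single-word groups). If the intent is to also keep one-word ANSWER entities, the label presumably lives on the first word; a sketch against that assumed word shape:

    # assumed word shape: {"text": str, "box": [x0, y0, x1, y1], "label": str}
    for i, merged_tagging in enumerate(merged_taggings):
        # keep multi-word groups, plus single-word groups labelled ANSWER
        if len(merged_tagging) > 1 or merged_tagging[0]['label'] == 'ANSWER':
            ...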