Spaces:

kavg
/

sri-doc

Runtime error

App Files Files Community

kavg committed on Dec 17, 2023

Commit

0dd8e27

•

1 Parent(s): 1ddbc38

Added ned entity merging method. Included additional outputs in the response to match the frontend Streamlit app.

Browse files

Files changed (4) hide show

main.py +37 -16
models.py +6 -1
preprocess.py +8 -8
token_classification.py +212 -28

main.py CHANGED Viewed

@@ -2,7 +2,7 @@ from config import Settings
 from preprocess import Preprocessor
 import ocr
 from PIL import Image
-from transformers import LiltForTokenClassification
 import token_classification
 import torch
 from fastapi import FastAPI, UploadFile
@@ -19,6 +19,7 @@ async def lifespan(app: FastAPI):
     config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     config['vision_client'] = ocr.VisionClient(settings.GCV_AUTH)
     config['processor'] = Preprocessor(settings.TOKENIZER)
     config['ser_model'] = LiltForTokenClassification.from_pretrained(settings.SER_MODEL)
     config['re_model'] = LiLTRobertaLikeForRelationExtraction.from_pretrained(settings.RE_MODEL)
     yield
@@ -29,8 +30,8 @@ app = FastAPI(lifespan=lifespan)
 @app.post("/submit-doc")
 async def ProcessDocument(file: UploadFile):
-  tokenClassificationOutput = await LabelTokens(file)
-  reOutput = ExtractRelations(tokenClassificationOutput)
   return reOutput
 async def LabelTokens(file):
@@ -39,28 +40,48 @@ async def LabelTokens(file):
   ocr_df = config['vision_client'].ocr(content, image)
   input_ids, attention_mask, token_type_ids, bbox, token_actual_boxes, offset_mapping = config['processor'].process(ocr_df, image = image)
   token_labels = token_classification.classifyTokens(config['ser_model'], input_ids, attention_mask, bbox, offset_mapping)
-  return {"token_labels": token_labels, "input_ids": input_ids, "bbox":bbox, "offset_mapping":offset_mapping, "attention_mask":attention_mask}
-def ExtractRelations(tokenClassificationOutput):
   token_labels = tokenClassificationOutput['token_labels']
   input_ids = tokenClassificationOutput['input_ids']
-  offset_mapping =  tokenClassificationOutput["offset_mapping"]
   attention_mask = tokenClassificationOutput["attention_mask"]
-  bbox = tokenClassificationOutput["bbox"]
-  entities = token_classification.createEntities(config['ser_model'], token_labels, input_ids, offset_mapping)
   config['re_model'].to(config['device'])
-  entity_dict = {'start': [entity[0] for entity in entities], 'end': [entity[1] for entity in entities], 'label': [entity[3] for entity in entities]}
   relations = [{'start_index': [], 'end_index': [], 'head': [], 'tail': []}]
   with torch.no_grad():
     outputs = config['re_model'](input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, entities=[entity_dict], relations=relations)
-  print(type(outputs.pred_relations[0]))
-  print(type(entities))
-  print(type(input_ids))
-  print(type(bbox))
-  print(type(token_labels))
-  # "pred_relations":json.dumps(outputs.pred_relations[0]), "entities":json.dumps(entities), "input_ids": json.dumps(input_ids.tolist()),
-  return {"pred_relations":json.dumps(outputs.pred_relations[0]), "entities":json.dumps(entities), "input_ids": json.dumps(input_ids.tolist()), "bboxes": json.dumps(bbox.tolist()),"token_labels":json.dumps(token_labels)}

 from preprocess import Preprocessor
 import ocr
 from PIL import Image
+from transformers import LiltForTokenClassification, AutoTokenizer
 import token_classification
 import torch
 from fastapi import FastAPI, UploadFile
     config['device'] = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     config['vision_client'] = ocr.VisionClient(settings.GCV_AUTH)
     config['processor'] = Preprocessor(settings.TOKENIZER)
+    config['tokenizer'] = AutoTokenizer.from_pretrained(settings.TOKENIZER)
     config['ser_model'] = LiltForTokenClassification.from_pretrained(settings.SER_MODEL)
     config['re_model'] = LiLTRobertaLikeForRelationExtraction.from_pretrained(settings.RE_MODEL)
     yield
 @app.post("/submit-doc")
 async def ProcessDocument(file: UploadFile):
+  # Endpoint handler: run token labelling on the uploaded file, then relation
+  # extraction on the result, and return the relation-extraction payload.
+  # LabelTokens yields the token-classification dict together with the OCR
+  # dataframe and the image size, all of which ExtractRelations consumes.
+  tokenClassificationOutput, ocr_df, img_size = await LabelTokens(file)
+  reOutput = ExtractRelations(tokenClassificationOutput, ocr_df, img_size)
   return reOutput
 async def LabelTokens(file):
   ocr_df = config['vision_client'].ocr(content, image)
   input_ids, attention_mask, token_type_ids, bbox, token_actual_boxes, offset_mapping = config['processor'].process(ocr_df, image = image)
   token_labels = token_classification.classifyTokens(config['ser_model'], input_ids, attention_mask, bbox, offset_mapping)
+  return {"token_labels": token_labels, "input_ids": input_ids, "bbox":bbox, "attention_mask":attention_mask}, ocr_df, image.size
+def ExtractRelations(tokenClassificationOutput, ocr_df, img_size):
+  # Build merged entities from the token labels, run the relation-extraction
+  # model on the merged token sequence, and return a dict whose values are all
+  # JSON-encoded strings.
   token_labels = tokenClassificationOutput['token_labels']
   input_ids = tokenClassificationOutput['input_ids']
   attention_mask = tokenClassificationOutput["attention_mask"]
+  # Keep the original token boxes: they are what is returned under "bboxes".
+  bbox_org = tokenClassificationOutput["bbox"]
+  merged_output, merged_words = token_classification.createEntities(config['ser_model'], token_labels, input_ids, ocr_df, config['tokenizer'], img_size, bbox_org)
+  entities = merged_output['entities']
+  # From here on input_ids / bbox / attention_mask refer to the merged
+  # sequence produced by createEntities, wrapped in a batch of one.
+  input_ids = torch.tensor([merged_output['input_ids']]).to(config['device'])
+  bbox = torch.tensor([merged_output['bbox']]).to(config['device'])
+  attention_mask = torch.tensor([merged_output['attention_mask']]).to(config['device'])
+  # NOTE: despite its name this maps label name -> integer id.
+  id2label = {"HEADER":0, "QUESTION":1, "ANSWER":2}
+  decoded_entities = []
+  for entity in entities:
+    # Record (label name, decoded text) first, then overwrite the label in
+    # place with its integer id for the model input.
+    decoded_entities.append((entity['label'], config['tokenizer'].decode(input_ids[0][entity['start']:entity['end']])))
+    entity['label'] = id2label[entity['label']]
   config['re_model'].to(config['device'])
+  # Column-wise view of the entities: parallel lists of start, end and label id.
+  entity_dict = {'start': [entity['start'] for entity in entities], 'end': [entity['end'] for entity in entities], 'label': [entity['label'] for entity in entities]}
+  # Empty relation placeholder passed alongside the entities.
   relations = [{'start_index': [], 'end_index': [], 'head': [], 'tail': []}]
   with torch.no_grad():
     outputs = config['re_model'](input_ids=input_ids, bbox=bbox, attention_mask=attention_mask, entities=[entity_dict], relations=relations)
+  decoded_pred_relations = []
+  for relation in outputs.pred_relations[0]:
+    # 'head' and 'tail' each unpack to a (start, end) pair used to slice the
+    # merged input_ids; the head text is reported as the question and the
+    # tail text as the answer.
+    head_start, head_end = relation['head']
+    tail_start, tail_end = relation['tail']
+    question =  config['tokenizer'].decode(input_ids[0][head_start:head_end])
+    answer = config['tokenizer'].decode(input_ids[0][tail_start:tail_end])
+    decoded_pred_relations.append((question, answer))
+    # print("Question:", question)
+    # print("Answer:", answer)
+    ## This prints bboxes of each question and answer
+    # for item in merged_words:
+    #     if item['text'] == question:
+    #       print('Question', item['box'])
+    #     if item['text'] == answer:
+    #       print('Answer', item['box'])
+    # print("----------")
+  # "input_ids" holds the merged sequence while "bboxes" holds the original
+  # (pre-merge) boxes, so the two lists are not index-aligned.
+  return {"pred_relations":json.dumps(outputs.pred_relations[0]), "entities":json.dumps(entities), "input_ids": json.dumps(input_ids.tolist()), "bboxes": json.dumps(bbox_org.tolist()),"token_labels":json.dumps(token_labels), "decoded_entities": json.dumps(decoded_entities), "decoded_pred_relations":json.dumps(decoded_pred_relations)}

models.py CHANGED Viewed

@@ -196,6 +196,8 @@ class LiLTRobertaLikeForRelationExtraction(LiltPreTrainedModel):
         super().__init__(config)
         self.lilt = LiltModel(config, add_pooling_layer=False)
         self.rehead = REHead(config)
         self.init_weights()
@@ -216,6 +218,8 @@ class LiLTRobertaLikeForRelationExtraction(LiltPreTrainedModel):
         entities=None,
         relations=None,
     ):
         outputs = self.lilt(
             input_ids,
@@ -230,7 +234,8 @@ class LiLTRobertaLikeForRelationExtraction(LiltPreTrainedModel):
             return_dict=return_dict,
         )
         sequence_output = outputs[0]
         re_output = self.rehead(sequence_output, entities, relations)
         return re_output

         super().__init__(config)
         self.lilt = LiltModel(config, add_pooling_layer=False)
+        # self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # self.extractor = REDecoder(config, config.hidden_size)
         self.rehead = REHead(config)
         self.init_weights()
         entities=None,
         relations=None,
     ):
+        # for param in self.lilt.parameters():
+        #   param.requires_grad = False
         outputs = self.lilt(
             input_ids,
             return_dict=return_dict,
         )
+        seq_length = input_ids.size(1)
         sequence_output = outputs[0]
         re_output = self.rehead(sequence_output, entities, relations)
         return re_output

preprocess.py CHANGED Viewed

@@ -1,6 +1,14 @@
 import torch
 from transformers import AutoTokenizer
 # class to turn the keys of a dict into attributes (thanks Stackoverflow)
 class AttrDict(dict):
     def __init__(self, *args, **kwargs):
@@ -23,14 +31,6 @@ class Preprocessor():
             actual_box = [x, y, x+w, y+h] # we turn it into (left, top, left+widght, top+height) to get the actual box
             actual_boxes.append(actual_box)
-        def normalize_box(box, width, height):
-            return [
-                int(1000 * (box[0] / width)),
-                int(1000 * (box[1] / height)),
-                int(1000 * (box[2] / width)),
-                int(1000 * (box[3] / height)),
-            ]
         boxes = []
         for box in actual_boxes:
             boxes.append(normalize_box(box, width, height))

 import torch
 from transformers import AutoTokenizer
+def normalize_box(box, width, height):
+    """Rescale a pixel box onto a 0-1000 grid.
+
+    Entries 0 and 2 of ``box`` are divided by ``width`` and entries 1 and 3
+    by ``height``; each ratio is multiplied by 1000 and truncated with
+    ``int``. Returns a new four-element list.
+    """
+    extents = (width, height, width, height)
+    return [int(1000 * (box[axis] / extent)) for axis, extent in enumerate(extents)]
 # class to turn the keys of a dict into attributes (thanks Stackoverflow)
 class AttrDict(dict):
     def __init__(self, *args, **kwargs):
             actual_box = [x, y, x+w, y+h] # we turn it into (left, top, left+widght, top+height) to get the actual box
             actual_boxes.append(actual_box)
         boxes = []
         for box in actual_boxes:
             boxes.append(normalize_box(box, width, height))

token_classification.py CHANGED Viewed

@@ -1,4 +1,6 @@
 import numpy as np
 def classifyTokens(model, input_ids, attention_mask, bbox, offset_mapping):
     outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
@@ -6,31 +8,213 @@ def classifyTokens(model, input_ids, attention_mask, bbox, offset_mapping):
     predictions = outputs.logits.argmax(-1).squeeze().tolist()
     return predictions
-def createEntities(model, predictions, input_ids, offset_mapping):
-    # we're only interested in tokens which aren't subwords
-    # we'll use the offset mapping for that
-    offset_mapping = np.array(offset_mapping)
-    is_subword = np.array(offset_mapping.squeeze().tolist())[:,0] != 0
-    id2label = {"HEADER":0, "QUESTION":1, "ANSWER":2}
-    # finally, store recognized "question" and "answer" entities in a list
-    entities = []
-    current_entity = None
-    start = None
-    end = None
-    for idx, (id, pred) in enumerate(zip(input_ids[0].tolist(), predictions)):
-        if not is_subword[idx]:
-            predicted_label = model.config.id2label[pred]
-            if predicted_label.startswith("B") and current_entity is None:
-                # means we're at the start of a new entity
-                current_entity = predicted_label.replace("B-", "")
-                start = idx
-            if current_entity is not None and current_entity not in predicted_label:
-                # means we're at the end of a new entity
-                end = idx
-                entities.append((start, end, current_entity, id2label[current_entity]))
-                current_entity = None
-    return entities

 import numpy as np
+from preprocess import normalize_box
+import copy
 def classifyTokens(model, input_ids, attention_mask, bbox, offset_mapping):
     outputs = model(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
     predictions = outputs.logits.argmax(-1).squeeze().tolist()
     return predictions
+def compare_boxes(b1,b2):
+  """Return True when both boxes have the same shape and equal elements."""
+  first, second = np.array(list(b1)), np.array(list(b2))
+  return np.array_equal(first, second)
+def mergable(w1,w2):
+  """Tell whether two labelled words may be merged.
+
+  Both words must share the same 'label', and either entry 1 or the last
+  entry of their 'box' values must differ by less than 7.
+  """
+  if w1['label'] != w2['label']:
+    return False
+  max_gap = 7
+  first_box, second_box = w1['box'], w2['box']
+  # The second comparison is only evaluated when the first one fails.
+  if abs(first_box[1] - second_box[1]) < max_gap:
+    return True
+  if abs(first_box[-1] - second_box[-1]) < max_gap:
+    return True
+  return False
+def convert_data(data, tokenizer, img_size):
+  # Turn a list of labelled text entries into one flat model input.
+  # Each entry in `data` is read for its "id", "text", "label" and "words"
+  # (a list of dicts with "text" and "box"). The result is a dict with the
+  # keys "input_ids", "bbox", "labels", "attention_mask" and "entities".
+  # NOTE: entries are consumed destructively (`line["words"].pop(0)`), so
+  # callers that still need `data` must pass a copy.
+  def normalize_bbox(bbox, size):
+    # Scale to a 0-1000 grid: size[0] divides entries 0 and 2, size[1]
+    # divides entries 1 and 3; results are truncated with int().
+    return [
+        int(1000 * bbox[0] / size[0]),
+        int(1000 * bbox[1] / size[1]),
+        int(1000 * bbox[2] / size[0]),
+        int(1000 * bbox[3] / size[1]),
+    ]
+  def simplify_bbox(bbox):
+      # Reduce a coordinate list to [min, min, max, max], taking the minima
+      # over the even / odd positions and the maxima over the even / odd
+      # positions starting at index 2 / 3.
+      return [
+          min(bbox[0::2]),
+          min(bbox[1::2]),
+          max(bbox[2::2]),
+          max(bbox[3::2]),
+      ]
+  def merge_bbox(bbox_list):
+    # Smallest box enclosing every four-element box in the list.
+    x0, y0, x1, y1 = list(zip(*bbox_list))
+    return [min(x0), min(y0), max(x1), max(y1)]
+  tokenized_doc = {"input_ids": [], "bbox": [], "labels": [], "attention_mask":[]}
+  entities = []
+  # id2label, entity_id_to_index_map and empty_entity are filled below but
+  # never read again inside this function.
+  id2label = {}
+  entity_id_to_index_map = {}
+  empty_entity = set()
+  for line in data:
+      if len(line["text"]) == 0:
+          empty_entity.add(line["id"])
+          continue
+      id2label[line["id"]] = line["label"]
+      tokenized_inputs = tokenizer(
+          line["text"],
+          add_special_tokens=False,
+          return_offsets_mapping=True,
+          return_attention_mask=True,
+      )
+      # text_length counts characters covered by the tokens seen so far;
+      # ocr_length counts normalized characters of the OCR words consumed.
+      text_length = 0
+      ocr_length = 0
+      bbox = []
+      for token_id, offset in zip(tokenized_inputs["input_ids"], tokenized_inputs["offset_mapping"]):
+          # NOTE(review): id 6 is presumably a standalone word-boundary piece
+          # in the tokenizer vocabulary - confirm. It gets a placeholder box
+          # that is filled in after the loop.
+          if token_id == 6:
+              bbox.append(None)
+              continue
+          text_length += offset[1] - offset[0]
+          tmp_box = []
+          # Pull OCR words until they cover at least as many characters as
+          # the tokens processed so far.
+          while ocr_length < text_length:
+              ocr_word = line["words"].pop(0)
+              ocr_length += len(
+                  tokenizer._tokenizer.normalizer.normalize_str(ocr_word["text"].strip())
+              )
+              tmp_box.append(simplify_bbox(ocr_word["box"]))
+          # A token that consumed no new OCR word reuses the previous boxes.
+          # NOTE(review): last_box is only assigned two lines below, so it is
+          # unbound if this happens on the very first token processed.
+          if len(tmp_box) == 0:
+              tmp_box = last_box
+          bbox.append(normalize_bbox(merge_bbox(tmp_box), img_size))
+          last_box = tmp_box  # noqa
+      # Replace each placeholder with a zero-size box at the top-left corner
+      # of the following token's box.
+      # NOTE(review): assumes a placeholder is never the last token and is
+      # never followed by another placeholder - confirm.
+      bbox = [
+          [bbox[i + 1][0], bbox[i + 1][1], bbox[i + 1][0], bbox[i + 1][1]] if b is None else b
+          for i, b in enumerate(bbox)
+      ]
+      # Entries labelled "other" get "O" tags; anything else gets a "B-" tag
+      # on its first token and "I-" tags on the rest.
+      if line["label"] == "other":
+          label = ["O"] * len(bbox)
+      else:
+          label = [f"I-{line['label'].upper()}"] * len(bbox)
+          label[0] = f"B-{line['label'].upper()}"
+      tokenized_inputs.update({"bbox": bbox, "labels": label})
+      if label[0] != "O":
+          entity_id_to_index_map[line["id"]] = len(entities)
+          # start/end are offsets into the document accumulated so far;
+          # end is exclusive.
+          entities.append(
+              {
+                  "start": len(tokenized_doc["input_ids"]),
+                  "end": len(tokenized_doc["input_ids"]) + len(tokenized_inputs["input_ids"]),
+                  "label": line["label"].upper(),
+              }
+          )
+      # Append this entry's tokens, boxes, labels and mask to the document.
+      for i in tokenized_doc:
+          tokenized_doc[i] = tokenized_doc[i] + tokenized_inputs[i]
+  chunk_size = 512
+  output = {}
+  for chunk_id, index in enumerate(range(0, len(tokenized_doc["input_ids"]), chunk_size)):
+    item = {}
+    entities_in_this_span = []
+    for k in tokenized_doc:
+        item[k] = tokenized_doc[k][index : index + chunk_size]
+    # global_to_local_map is filled but never read afterwards.
+    global_to_local_map = {}
+    for entity_id, entity in enumerate(entities):
+        # Keep only entities whose start and end both fall inside this
+        # chunk. Because end is compared with a strict "<", an entity ending
+        # exactly on the chunk boundary is left out.
+        if (
+            index <= entity["start"] < index + chunk_size
+            and index <= entity["end"] < index + chunk_size
+        ):
+            # Offsets are rebased to the chunk, mutating the entity in place.
+            entity["start"] = entity["start"] - index
+            entity["end"] = entity["end"] - index
+            global_to_local_map[entity_id] = len(entities_in_this_span)
+            entities_in_this_span.append(entity)
+    item.update(
+        {
+            "entities": entities_in_this_span
+        }
+    )
+    # Chunks are concatenated back together under each key.
+    # NOTE(review): with more than one chunk the rebased entity offsets no
+    # longer index into the concatenated lists - confirm this is intended.
+    for key in item.keys():
+      output[key] = output.get(key, []) + item[key]
+  return output
+def dfs(i, merged, width, height, visited, df_words):
+    """Collect the group of linked words reachable from word ``i``.
+
+    Two words are linked when all of the following hold, comparing the box of
+    the first element of each word's 'words' list:
+
+    * entry 1 or the last entry of the boxes differ by less than 1% of
+      ``height``;
+    * the words carry the same 'label';
+    * entry 0 or the second-to-last entry of the boxes differ by less than
+      8% of ``width``.
+
+    The traversal is depth-first in pre-order and tries candidates in index
+    order, exactly like a recursive walk, but it keeps its own stack so that
+    a long chain of linked words cannot exceed Python's recursion limit.
+
+    Args:
+        i: index in ``df_words`` of the word to start from.
+        merged: list that receives the visited words, in visiting order.
+        width: page width used for the horizontal threshold.
+        height: page height used for the vertical threshold.
+        visited: set of already-visited indices; updated in place.
+        df_words: list of word dicts with 'label' and 'words' keys.
+
+    Returns:
+        The same ``merged`` list that was passed in.
+    """
+    v_threshold = int(.01 * height)
+    h_threshold = int(.08 * width)
+    total = len(df_words)
+
+    def linked(a, b):
+        w1 = df_words[a]['words'][0]
+        w2 = df_words[b]['words'][0]
+        return (abs(w1['box'][1] - w2['box'][1]) < v_threshold or abs(w1['box'][-1] - w2['box'][-1]) < v_threshold) \
+            and (df_words[a]['label'] == df_words[b]['label']) \
+            and (abs(w1['box'][0] - w2['box'][0]) < h_threshold or abs(w1['box'][-2] - w2['box'][-2]) < h_threshold)
+
+    visited.add(i)
+    merged.append(df_words[i])
+    # Each stack entry pairs a word index with the iterator over its remaining
+    # candidates, so the scan resumes where it stopped after a descent.
+    stack = [(i, iter(range(total)))]
+    while stack:
+        current, candidates = stack[-1]
+        for j in candidates:
+            if j not in visited and linked(current, j):
+                visited.add(j)
+                merged.append(df_words[j])
+                stack.append((j, iter(range(total))))
+                break
+        else:
+            # Every candidate of `current` has been examined.
+            stack.pop()
+    return merged
+def createEntities(model, predictions, input_ids, ocr_df, tokenizer, img_size, bbox):
+    # Group labelled OCR words into multi-word entities and convert them to
+    # model input. Returns (output of convert_data, list of merged words).
+    width, height = img_size
+    words = []
+    # One record per OCR row, built from its 'text', 'left', 'top', 'width'
+    # and 'height' columns; boxes are [left, top, right, bottom].
+    for index,row in ocr_df.iterrows():
+        word = {}
+        origin_box = [row['left'],row['top'],row['left']+row['width'],row['top']+row['height']]
+        word['word_text'] = row['text']
+        word['word_box'] = origin_box
+        word['normalized_box'] = normalize_box(word['word_box'], width, height)
+        words.append(word)
+    raw_input_ids = input_ids[0].tolist()
+    token_boxes = bbox.squeeze().tolist()
+    # Drop CLS / SEP / PAD positions from the ids, the predictions and the
+    # boxes so the three lists stay aligned.
+    special_tokens = [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]
+    input_ids = [id for id in raw_input_ids if id not in special_tokens]
+    predictions = [model.config.id2label[prediction] for i,prediction in enumerate(predictions) if not (raw_input_ids[i] in special_tokens)]
+    actual_boxes = [box for i,box in enumerate(token_boxes) if not (raw_input_ids[i] in special_tokens )]
+    assert(len(actual_boxes) == len(predictions))
+    for word in words:
+        word_labels = []
+        token_labels = []
+        word_tagging = None
+        # A token belongs to a word when its box equals the word's
+        # normalized box exactly.
+        for i,box in enumerate(actual_boxes,start=0):
+            if compare_boxes(word['normalized_box'],box):
+                # Strip the two-character tag prefix (e.g. "B-") from
+                # non-'O' predictions.
+                if predictions[i] != 'O':
+                    word_labels.append(predictions[i][2:])
+                else:
+                    word_labels.append('O')
+                token_labels.append(predictions[i])
+        # The word takes its first token's label, unless that is 'O', in
+        # which case it takes the last token's label. Words with no matching
+        # token are tagged 'O'.
+        if word_labels != []:
+            word_tagging =  word_labels[0] if word_labels[0] != 'O' else word_labels[-1]
+        else:
+            word_tagging = 'O'
+        word['word_labels'] = token_labels
+        word['word_tagging'] = word_tagging
+    # Keep only words tagged with something other than 'O'; ids are the
+    # positions in the full `words` list.
+    filtered_words = [{'id':i,'text':word['word_text'],
+                'label':word['word_tagging'],
+                'box':word['word_box'],
+                'words':[{'box':word['word_box'],'text':word['word_text']}]} for i,word in enumerate(words) if word['word_tagging'] != 'O']
+    merged_taggings = []
+    df_words = filtered_words.copy()
+    visited = set()
+    # Partition the filtered words into groups via dfs; each unvisited word
+    # starts a new group.
+    for i in range(len(df_words)):
+        if i not in visited:
+            merged_taggings.append(dfs(i,[], width, height, visited, df_words))
+    merged_words = []
+    for i,merged_tagging in enumerate(merged_taggings):
+        # Only groups with more than one word become a merged word.
+        if len(merged_tagging) > 1:
+            new_word = {}
+            merging_word = " ".join([word['text'] for word in merged_tagging])
+            # Box spans from the first member's left/top to the last member's
+            # right/bottom, padded by 5 horizontally and 10 vertically.
+            merging_box = [merged_tagging[0]['box'][0]-5,merged_tagging[0]['box'][1]-10,merged_tagging[-1]['box'][2]+5,merged_tagging[-1]['box'][3]+10]
+            new_word['text'] = merging_word
+            new_word['box'] = merging_box
+            new_word['label'] = merged_tagging[0]['label']
+            new_word['id'] = filtered_words[-1]['id']+i+1
+            new_word['words'] = [{'box':word['box'],'text':word['text']} for word in merged_tagging]
+            # new_word['start'] =
+            merged_words.append(new_word)
+    # filtered_words and the three values derived from it below are not
+    # used by the return value.
+    filtered_words.extend(merged_words)
+    predictions = [word['label'] for word in filtered_words]
+    actual_boxes = [word['box'] for word in filtered_words]
+    unique_taggings = set(predictions)
+    # convert_data pops from each entry's 'words' list, so it gets a deep
+    # copy and merged_words is returned intact.
+    output = convert_data(copy.deepcopy(merged_words), tokenizer, img_size)
+    return output, merged_words