AuditEdge's picture
initial commit
81e13bb
import os
import pandas as pd
import os
# Set GOOGLE_APPLICATION_CREDENTIALS at import time, replacing any value
# already present in the environment. This is the variable Google Cloud
# client libraries consult to locate a service-account key file; presumably
# it is what lets vision.ImageAnnotatorClient() in detect_text authenticate.
# The path is relative, so it resolves against the process's working
# directory rather than this file's location.
# NOTE(review): the key file name is hard-coded here -- confirm the key
# itself is not committed and consider supplying the path via deployment
# configuration instead.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "./titanium-scope-436311-t3-966373f5aa2f.json"
def run_tesseract_on_image(image_path):  # -> tsv output path
    """Run the Tesseract CLI on one image and return the TSV path it writes.

    The output is written under ``/content`` and named after the image's
    base name up to (not including) its first ``.``.

    Args:
        image_path: Path of the image file to OCR.

    Returns:
        Path of the generated ``.tsv`` file.

    Raises:
        ValueError: If ``tesseract`` cannot be started or exits with a
            non-zero status.
    """
    import subprocess

    print("image_path", image_path)
    # partition() keeps the whole name when it contains no dot; slicing on
    # find('.') == -1 used to silently drop the last character instead.
    image_name = os.path.basename(image_path).partition('.')[0]
    output_base = f"/content/{image_name}"
    error_message = 'Tesseract OCR Error please verify image format PNG,JPG,JPEG'
    # Passing an argument list (no shell) means quotes, spaces or shell
    # metacharacters in the path can neither break nor inject into the
    # command line.
    try:
        completed = subprocess.run(
            ["tesseract", image_path, output_base, "-l", "eng", "tsv"],
            check=False,
        )
    except OSError as exc:
        # The binary is missing or not executable: report it the same way a
        # failed run is reported so callers only need to handle ValueError.
        raise ValueError(error_message) from exc
    if completed.returncode != 0:
        raise ValueError(error_message)
    return f"{output_base}.tsv"
def clean_tesseract_output(tsv_output_path):
    """Parse a Tesseract TSV file into a list of words with bounding boxes.

    Args:
        tsv_output_path: Path of a tab-separated file with at least the
            ``left``, ``top``, ``width``, ``height`` and ``text`` columns.

    Returns:
        A list of dicts, one per row whose text is non-blank, in file order.
        Each dict has ``'word_text'`` (the text as a string) and
        ``'word_box'`` (``[left, top, left + width, top + height]`` as ints).
    """
    import csv

    print("tsv_output_path", tsv_output_path)
    ocr_df = pd.read_csv(
        tsv_output_path,
        sep='\t',
        # OCR text may be a lone quote character; it must not be treated as
        # the start of a quoted CSV field.
        quoting=csv.QUOTE_NONE,
        # Keep tokens such as "2024" as strings even when every word on the
        # page is numeric, so the .str accessor below always works.
        dtype={'text': str},
        # Only genuinely empty cells count as missing. With pandas' default
        # NA list, real words like "NA", "N/A" or "null" would be parsed as
        # NaN and then discarded by dropna().
        keep_default_na=False,
        na_values=[''],
    )
    ocr_df = ocr_df.dropna()
    ocr_df = ocr_df[ocr_df['text'].str.strip() != '']

    columns = (
        ocr_df['text'],
        ocr_df['left'],
        ocr_df['top'],
        ocr_df['width'],
        ocr_df['height'],
    )
    return [
        {
            'word_text': text,
            'word_box': [
                int(left),
                int(top),
                int(left + width),
                int(top + height),
            ],
        }
        for text, left, top, width, height in zip(*columns)
    ]
def _enclosing_box(vertices):
    """Return ``[x_min, y_min, x_max, y_max]`` covering all ``[x, y]`` points."""
    xs = [x for x, _ in vertices]
    ys = [y for _, y in vertices]
    return [min(xs), min(ys), max(xs), max(ys)]


def detect_text(path):
    """Detect the words in an image file with Google Cloud Vision.

    Args:
        path: Path of the image file to send to the text-detection API.

    Returns:
        A list of dicts, one per detected word, each with ``"word_text"``
        (the annotation's description) and ``"word_box"``
        (``[x_min, y_min, x_max, y_max]`` as ints).

    Raises:
        Exception: If the API response carries an error message.
    """
    print("this is path:", path)
    from google.cloud import vision

    client = vision.ImageAnnotatorClient()
    with open(path, "rb") as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    response = client.text_detection(image=image)

    # Fail before touching the annotations: an error response should be
    # reported, not half-processed.
    if response.error.message:
        raise Exception(
            "{}\nFor more info on error messages, check: "
            "https://cloud.google.com/apis/design/errors".format(response.error.message)
        )

    print("Texts:")
    list_of_dict = []
    # The first annotation is skipped; per the Vision API documentation it
    # holds the full detected text rather than a single word.
    for text in response.text_annotations[1:]:
        print(f'\n"{text.description}"')
        vertices_list = [
            [int(vertex.x), int(vertex.y)]
            for vertex in text.bounding_poly.vertices
        ]
        print("vertices_list", vertices_list)
        # Use the min/max over all vertices instead of picking two corners:
        # for skewed or rotated text the polygon is not axis-aligned, and a
        # box built from two of its vertices can be degenerate or fail to
        # enclose the word.
        list_of_dict.append({
            "word_text": text.description,
            "word_box": _enclosing_box(vertices_list),
        })
    return list_of_dict
def prepare_batch_for_inference(image_paths):
    """Run OCR on every image and bundle the results into one batch dict.

    Args:
        image_paths: Iterable of image file paths; each one is passed to
            ``detect_text``.

    Returns:
        A dict with three parallel entries:
        ``"image_path"`` (the ``image_paths`` argument itself),
        ``"bboxes"`` (one list of word boxes per image) and
        ``"words"`` (one list of word strings per image).
    """
    ocr_results = [detect_text(single_path) for single_path in image_paths]
    print("clean_outputs", ocr_results)

    def pluck(field):
        # One inner list per image, holding `field` of every detected word.
        return [[entry[field] for entry in entries] for entries in ocr_results]

    per_image_words = pluck('word_text')
    per_image_boxes = pluck('word_box')

    return {
        "image_path": image_paths,
        "bboxes": per_image_boxes,
        "words": per_image_words,
    }