Spaces:

Innovex
/

ExCeipt

Running

App Files Files Community

ExCeipt / Layoutlmv3_inference /ocr2.py

Scezui

added the content editable and removed the whitespace delimiter and replaced it with getting the whole text

ee7e9d0 9 months ago

raw

history blame contribute delete

8.8 kB

	import os
	import pandas as pd
	import cv2
	import numpy as np
	import json
	import requests
	import traceback
	import tempfile

	FLASK_DEBUG=1
	from PIL import Image

	def preprocess_image(image_path, max_file_size_mb=1, target_file_size_mb=0.5):
	    """Enhance an image and shrink it until its JPEG encoding fits a size cap.

	    The image is read with OpenCV, passed through ``enhance_txt`` and written
	    as a JPEG to a scratch file. While that file is larger than
	    ``max_file_size_mb`` the enhanced image is scaled down and re-encoded.

	    Args:
	        image_path: Path of the image file to read.
	        max_file_size_mb: Largest accepted JPEG size, in megabytes.
	        target_file_size_mb: Size each resize step aims for, in megabytes.

	    Returns:
	        The image decoded back from the final JPEG scratch file, or ``None``
	        if any step failed (the error is printed, not raised).
	    """
	    try:
	        image = cv2.imread(image_path)
	        if image is None:
	            # cv2.imread reports an unreadable or missing file by returning
	            # None; fail here with a message that names the file.
	            raise ValueError(f"Could not read image: {image_path}")

	        enhanced = enhance_txt(image)

	        # A private scratch directory replaces NamedTemporaryFile(...).name:
	        # that idiom only yields a name whose file is already deleted, and
	        # the files cv2.imwrite then created under it were never removed.
	        with tempfile.TemporaryDirectory() as scratch_dir:
	            temp_file_path = os.path.join(scratch_dir, "enhanced.jpg")
	            cv2.imwrite(temp_file_path, enhanced)

	            # Convert bytes to megabytes.
	            file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)

	            while file_size_mb > max_file_size_mb:
	                print(f"File size ({file_size_mb} MB) exceeds the maximum allowed size ({max_file_size_mb} MB). Resizing the image.")

	                # The ratio applies to each side, so the pixel count shrinks
	                # by target/current.
	                ratio = np.sqrt(target_file_size_mb / file_size_mb)

	                # Scale the *current* enhanced image. Using the original
	                # image's shape would ignore earlier resize passes (and any
	                # cropping done by enhance_txt), so a later pass could make
	                # the image larger again.
	                new_width = max(1, int(enhanced.shape[1] * ratio))
	                new_height = max(1, int(enhanced.shape[0] * ratio))
	                enhanced = cv2.resize(enhanced, (new_width, new_height))

	                cv2.imwrite(temp_file_path, enhanced)
	                file_size_mb = os.path.getsize(temp_file_path) / (1024 * 1024)
	                print(f"New file size: ({file_size_mb} MB)")

	            # Decode the JPEG that was actually measured before the scratch
	            # directory is removed.
	            image_resized = cv2.imread(temp_file_path)

	        return image_resized

	    except Exception as e:
	        print(f"An error occurred in preprocess_image: {str(e)}")
	        return None


	def enhance_txt(img, intensity_increase=20, bilateral_filter_diameter=9, bilateral_filter_sigma_color=75, bilateral_filter_sigma_space=75):
	    """Brighten and denoise an image, then crop it to a detected quadrilateral.

	    Edges are detected on a grayscale copy of the input. Among the five
	    largest contours, the first one that approximates to four points is
	    taken as the region of interest. The brightened, bilateral-filtered
	    image is cropped to that contour's bounding rectangle.

	    Args:
	        img: Image array accepted by ``cv2.cvtColor(..., COLOR_BGR2GRAY)``.
	        intensity_increase: Constant added to every pixel value.
	        bilateral_filter_diameter: ``d`` argument of ``cv2.bilateralFilter``.
	        bilateral_filter_sigma_color: ``sigmaColor`` argument of the filter.
	        bilateral_filter_sigma_space: ``sigmaSpace`` argument of the filter.

	    Returns:
	        The cropped image, or the whole enhanced image when no usable
	        four-point contour is found or its rectangle leaves the image.
	    """
	    img_height, img_width = img.shape[:2]

	    # Edge map for contour detection, inverted so edges are dark on white.
	    grayscale_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
	    blurred = cv2.GaussianBlur(grayscale_img, (1, 1), 0)
	    edged = 255 - cv2.Canny(blurred, 100, 150, apertureSize=7)

	    # Widen before adding: uint8 arithmetic wraps around (250 + 20 -> 14),
	    # which turned bright pixels dark before np.clip ever saw them.
	    brightened = img.astype(np.int32) + intensity_increase
	    img = np.clip(brightened, 0, 255).astype(np.uint8)

	    # Apply bilateral filter to reduce noise.
	    img = cv2.bilateralFilter(img, bilateral_filter_diameter, bilateral_filter_sigma_color, bilateral_filter_sigma_space)

	    # Keep only the five largest contours as candidates.
	    contours, _ = cv2.findContours(edged.copy(), cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
	    contours = sorted(contours, key=cv2.contourArea, reverse=True)[:5]

	    # The first candidate that simplifies to four points is used.
	    quad_contour = None
	    for candidate in contours:
	        perimeter = cv2.arcLength(candidate, True)
	        approx = cv2.approxPolyDP(candidate, 0.02 * perimeter, True)
	        if len(approx) == 4:
	            quad_contour = approx
	            break

	    # If no contour is found or the contour is small, use the whole image.
	    if quad_contour is None or cv2.contourArea(quad_contour) < 500:
	        quad_contour = np.array(
	            [
	                [[0, 0]],
	                [[img_width - 1, 0]],
	                [[img_width - 1, img_height - 1]],
	                [[0, img_height - 1]],
	            ],
	            dtype=np.int32,  # same point type as the contours OpenCV returns
	        )

	    x, y, box_width, box_height = cv2.boundingRect(quad_contour)

	    # Crop only when the rectangle lies fully inside the image.
	    if x >= 0 and y >= 0 and x + box_width <= img.shape[1] and y + box_height <= img.shape[0]:
	        cropped_img = img[y:y + box_height, x:x + box_width]
	    else:
	        print("Bounding rectangle is out of image boundaries")
	        cropped_img = img

	    return cropped_img

	def run_tesseract_on_preprocessed_image(preprocessed_image, image_path):
	    """Send a preprocessed image to the OCR.space API and save the JSON reply.

	    The image is written to ``static/temp/<name>_preprocessed.jpg``, posted
	    to ``https://api.ocr.space/parse/image`` and the parsed JSON response is
	    stored as ``static/temp/<name>_ocr.json``. ``<name>`` is the base name of
	    ``image_path`` up to its first dot.

	    Args:
	        preprocessed_image: Image array to write with ``cv2.imwrite``.
	        image_path: Path of the source image; only its base name is used.

	    Returns:
	        Path of the saved JSON file, or ``None`` when the API does not answer
	        with HTTP 200 or any step raises (the error is printed).
	    """
	    try:
	        # Text before the first '.'. partition() keeps the whole name when
	        # there is no dot, where find() returned -1 and the slice silently
	        # dropped the last character.
	        image_name = os.path.basename(image_path).partition('.')[0]

	        # Create the "temp" folder if it doesn't exist.
	        temp_folder = "static/temp"
	        os.makedirs(temp_folder, exist_ok=True)

	        url = "https://api.ocr.space/parse/image"

	        # SECURITY: an API key is committed in source. It can now be
	        # overridden through the OCR_SPACE_API_KEY environment variable; the
	        # literal stays only as a fallback so existing deployments keep
	        # working. It should be rotated and removed.
	        api_key = os.environ.get("OCR_SPACE_API_KEY", "K88232854988957")
	        language = "eng"

	        preprocessed_path = os.path.join(temp_folder, f"{image_name}_preprocessed.jpg")
	        json_path = os.path.join(temp_folder, f"{image_name}_ocr.json")

	        # Save the preprocessed image so it can be uploaded as a file.
	        cv2.imwrite(preprocessed_path, preprocessed_image)

	        payload = {
	            "apikey": api_key,
	            "language": language,
	            "isOverlayRequired": True,
	            "OCREngine": 2,
	        }
	        with open(preprocessed_path, "rb") as image_file:
	            # Without a timeout a stalled connection would block forever; a
	            # timeout raises and is reported by the handler below.
	            response = requests.post(url, data=payload, files={"file": image_file}, timeout=60)

	        if response.status_code != 200:
	            print("Error: " + response.text)
	            return None

	        result = response.json()
	        with open(json_path, 'w') as json_file:
	            json.dump(result, json_file)
	        print("---JSON file saved")

	        return json_path

	    except Exception as e:
	        print(f"An error occurred during OCR request: {str(e)}")
	        return None

	def clean_tesseract_output(json_output_path):
	    """Extract words and their bounding boxes from a saved OCR JSON file.

	    Reads ``ParsedResults[0]['TextOverlay']['Lines']`` from the JSON document
	    and flattens every entry of each line's ``Words`` list.

	    Args:
	        json_output_path: Path of the JSON file to read.

	    Returns:
	        A list of dicts with ``word_text`` (the ``WordText`` value) and
	        ``word_box`` (``[left, top, left + width, top + height]``), or
	        ``None`` when the file is missing, is not valid JSON, or lacks the
	        expected structure (the error is printed).
	    """
	    try:
	        with open(json_output_path, 'r', encoding='utf-8') as json_file:
	            data = json.load(json_file)

	        lines = data['ParsedResults'][0]['TextOverlay']['Lines']

	        return [
	            {
	                'word_text': word_info['WordText'],
	                'word_box': [
	                    word_info['Left'],
	                    word_info['Top'],
	                    word_info['Left'] + word_info['Width'],
	                    word_info['Top'] + word_info['Height'],
	                ],
	            }
	            for line in lines
	            for word_info in line['Words']
	        ]
	    # TypeError covers JSON nulls in place of the expected containers (for
	    # example "TextOverlay": null), which previously escaped as a crash.
	    except (KeyError, IndexError, TypeError, FileNotFoundError, json.JSONDecodeError) as e:
	        print(f"Error cleaning Tesseract output: {str(e)}")
	        return None

	def prepare_batch_for_inference(image_paths):
	    """Preprocess and OCR each image, then assemble the inference batch.

	    Every path goes through ``preprocess_image``,
	    ``run_tesseract_on_preprocessed_image`` and ``clean_tesseract_output``.
	    An image is skipped when any of those steps yields no result, so the
	    three lists in the batch always have the same length and order.

	    Args:
	        image_paths: Sequence of image file paths to process.

	    Returns:
	        A dict with ``image_path`` (paths that were processed successfully),
	        ``bboxes`` (one list of word boxes per image) and ``words`` (one list
	        of word strings per image).
	    """
	    print(f"Number of images to process: {len(image_paths)}")
	    print("1. Preparing for Inference")
	    print("2. Starting Preprocessing")

	    # (image_path, json_output_path) for every image the OCR step accepted.
	    ocr_results = []
	    for image_path in image_paths:
	        print(f"Processing the image: {image_path}")
	        print("3. Preprocessing the Receipt")
	        preprocessed_image = preprocess_image(image_path)
	        if preprocessed_image is None:
	            continue

	        print("4. Preprocessing done. Running OCR")
	        json_output_path = run_tesseract_on_preprocessed_image(preprocessed_image, image_path)
	        print("5. OCR Complete")
	        if json_output_path:
	            ocr_results.append((image_path, json_output_path))

	    print("6. Preprocessing and OCR Done")

	    # clean_tesseract_output returns None for unusable OCR output; iterating
	    # over that None used to crash the list comprehensions below.
	    cleaned = []
	    for image_path, json_output_path in ocr_results:
	        clean_output = clean_tesseract_output(json_output_path)
	        if clean_output is not None:
	            cleaned.append((image_path, clean_output))
	    print("7. Cleaned OCR output")

	    word_lists = [[word['word_text'] for word in clean_output] for _, clean_output in cleaned]
	    print("8. Word List Created")
	    boxes_lists = [[word['word_box'] for word in clean_output] for _, clean_output in cleaned]
	    print("9. Box List Created")

	    # Only list images that produced output, so index i of every list refers
	    # to the same image even when some inputs failed.
	    inference_batch = {
	        "image_path": [image_path for image_path, _ in cleaned],
	        "bboxes": boxes_lists,
	        "words": word_lists,
	    }

	    print("10. Prepared for Inference Batch")
	    return inference_batch