shreyasvaidya's picture
Upload folder using huggingface_hub
01bb3bb verified
import numpy as np
# def detect_para(bbox_dict):
# alpha1 = 0.2
# alpha2 = 0.7
# beta1 = 0.4
# data = bbox_dict
# word_crops = list(data.keys())
# for i in word_crops:
# data[i]["x1"], data[i]["y1"], data[i]["x2"], data[i]["y2"] = data[i]["bbox"]
# data[i]["xc"] = (data[i]["x1"] + data[i]["x2"]) / 2
# data[i]["yc"] = (data[i]["y1"] + data[i]["y2"]) / 2
# data[i]["w"] = data[i]["x2"] - data[i]["x1"]
# data[i]["h"] = data[i]["y2"] - data[i]["y1"]
# patch_info = {}
# while word_crops:
# img_name = word_crops[0].split("_")[0]
# word_crop_collection = [
# word_crop for word_crop in word_crops if word_crop.startswith(img_name)
# ]
# centroids = {}
# lines = []
# img_word_crops = word_crop_collection.copy()
# para = []
# while img_word_crops:
# clusters = []
# para_words_group = [
# img_word_crops[0],
# ]
# added = [
# img_word_crops[0],
# ]
# img_word_crops.remove(img_word_crops[0])
# ## determining the paragraph
# while added:
# word_crop = added.pop()
# for i in range(len(img_word_crops)):
# word_crop_ = img_word_crops[i]
# if (
# abs(data[word_crop_]["yc"] - data[word_crop]["yc"])
# < data[word_crop]["h"] * alpha1
# ):
# if data[word_crop]["xc"] > data[word_crop_]["xc"]:
# if (data[word_crop]["x1"] - data[word_crop_]["x2"]) < data[
# word_crop
# ]["h"] * alpha2:
# para_words_group.append(word_crop_)
# added.append(word_crop_)
# else:
# if (data[word_crop_]["x1"] - data[word_crop]["x2"]) < data[
# word_crop
# ]["h"] * alpha2:
# para_words_group.append(word_crop_)
# added.append(word_crop_)
# else:
# if data[word_crop]["yc"] > data[word_crop_]["yc"]:
# if (data[word_crop]["y1"] - data[word_crop_]["y2"]) < data[
# word_crop
# ]["h"] * beta1 and (
# (
# (data[word_crop_]["x1"] < data[word_crop]["x2"])
# and (data[word_crop_]["x1"] > data[word_crop]["x1"])
# )
# or (
# (data[word_crop_]["x2"] < data[word_crop]["x2"])
# and (data[word_crop_]["x2"] > data[word_crop]["x1"])
# )
# or (
# (data[word_crop]["x1"] > data[word_crop_]["x1"])
# and (data[word_crop]["x2"] < data[word_crop_]["x2"])
# )
# ):
# para_words_group.append(word_crop_)
# added.append(word_crop_)
# else:
# if (data[word_crop_]["y1"] - data[word_crop]["y2"]) < data[
# word_crop
# ]["h"] * beta1 and (
# (
# (data[word_crop_]["x1"] < data[word_crop]["x2"])
# and (data[word_crop_]["x1"] > data[word_crop]["x1"])
# )
# or (
# (data[word_crop_]["x2"] < data[word_crop]["x2"])
# and (data[word_crop_]["x2"] > data[word_crop]["x1"])
# )
# or (
# (data[word_crop]["x1"] > data[word_crop_]["x1"])
# and (data[word_crop]["x2"] < data[word_crop_]["x2"])
# )
# ):
# para_words_group.append(word_crop_)
# added.append(word_crop_)
# img_word_crops = [p for p in img_word_crops if p not in para_words_group]
# ## processing for the line
# while para_words_group:
# line_words_group = [
# para_words_group[0],
# ]
# added = [
# para_words_group[0],
# ]
# para_words_group.remove(para_words_group[0])
# ## determining the line
# while added:
# word_crop = added.pop()
# for i in range(len(para_words_group)):
# word_crop_ = para_words_group[i]
# if (
# abs(data[word_crop_]["yc"] - data[word_crop]["yc"])
# < data[word_crop]["h"] * alpha1
# ):
# if data[word_crop]["xc"] > data[word_crop_]["xc"]:
# if (data[word_crop]["x1"] - data[word_crop_]["x2"]) < data[
# word_crop
# ]["h"] * alpha2:
# line_words_group.append(word_crop_)
# added.append(word_crop_)
# else:
# if (data[word_crop_]["x1"] - data[word_crop]["x2"]) < data[
# word_crop
# ]["h"] * alpha2:
# line_words_group.append(word_crop_)
# added.append(word_crop_)
# para_words_group = [
# p for p in para_words_group if p not in line_words_group
# ]
# xc = [data[word_crop]["xc"] for word_crop in line_words_group]
# idxs = np.argsort(xc)
# patch_cluster_ = [line_words_group[i] for i in idxs]
# line_words_group = patch_cluster_
# x1 = [data[word_crop]["x1"] for word_crop in line_words_group]
# x2 = [data[word_crop]["x2"] for word_crop in line_words_group]
# y1 = [data[word_crop]["y1"] for word_crop in line_words_group]
# y2 = [data[word_crop]["y2"] for word_crop in line_words_group]
# txt_line = [data[word_crop]["txt"] for word_crop in line_words_group]
# txt = " ".join(txt_line)
# x = [x1[0]]
# y1_ = [y1[0]]
# y2_ = [y2[0]]
# l = [len(txt_l) for txt_l in txt_line]
# for i in range(1, len(x1)):
# x.append((x1[i] + x2[i - 1]) / 2)
# y1_.append((y1[i] + y1[i - 1]) / 2)
# y2_.append((y2[i] + y2[i - 1]) / 2)
# x.append(x2[-1])
# y1_.append(y1[-1])
# y2_.append(y2[-1])
# line_info = {
# "x": x,
# "y1": y1_,
# "y2": y2_,
# "l": l,
# "txt": txt,
# "word_crops": line_words_group,
# }
# clusters.append(line_info)
# y_ = [clusters[i]["y1"][0] for i in range(len(clusters))]
# idxs = np.argsort(y_)
# clusters_ = [clusters[i] for i in idxs]
# txt = [clusters[i]["txt"] for i in idxs]
# l = [len(t) for t in txt]
# txt = " ".join(txt)
# para_info = {"lines": clusters_, "l": l, "txt": txt}
# para.append(para_info)
# for word_crop in word_crop_collection:
# word_crops.remove(word_crop)
# return "\n".join([para[i]["txt"] for i in range(len(para))])
def detect_para(recognized_texts, overlap_threshold=0.4):
    """
    Group recognized words into text lines and order each line left to right.

    Grouping is greedy: the first word not yet assigned (in the dictionary's
    iteration order) seeds a new line, and every remaining word whose vertical
    overlap ratio with that seed is strictly greater than ``overlap_threshold``
    joins the line. Candidates are compared against the seed only, not against
    words that were already added to the line.

    Args:
        recognized_texts (dict): Maps an arbitrary key to a dict holding at
            least ``'txt'`` (the recognized word) and ``'bbox'`` (a sequence
            of four numbers ``[x1, y1, x2, y2]``). The keys themselves are
            not used. The input is not modified.
        overlap_threshold (float): Minimum vertical overlap ratio (exclusive)
            for a word to be placed on the same line as the seed word.
            Defaults to 0.4.

    Returns:
        list: A list of lines, each a list of word strings sorted by the
            ``x1`` coordinate of their bounding boxes. Lines appear in the
            order their seed words were encountered; they are not re-sorted
            vertically.
    """

    def calculate_overlap(bbox1, bbox2):
        """Return the vertical overlap of two boxes relative to the shorter one."""
        _, top_a, _, bottom_a = bbox1
        _, top_b, _, bottom_b = bbox2
        shared = max(0, min(bottom_a, bottom_b) - max(top_a, top_b))
        shorter = min(bottom_a - top_a, bottom_b - top_b)
        # A box with zero (or negative) height cannot be normalized against;
        # report no overlap so it ends up on a line of its own.
        return shared / shorter if shorter > 0 else 0

    # Only the values carry the text and geometry; work on a copy so the
    # caller's mapping is left untouched.
    pending = list(recognized_texts.values())
    sorted_lines = []

    while pending:
        seed = pending[0]
        seed_bbox = seed['bbox']
        line = [(seed['txt'], seed_bbox)]
        leftover = []

        for entry in pending[1:]:
            text, bbox = entry['txt'], entry['bbox']
            if calculate_overlap(seed_bbox, bbox) > overlap_threshold:
                line.append((text, bbox))
            else:
                leftover.append(entry)

        pending = leftover
        # Order the words of the finished line by x1 (left edge). sorted() is
        # stable, so words sharing an x1 keep their encounter order.
        line.sort(key=lambda word: word[1][0])
        sorted_lines.append([text for text, _ in line])

    return sorted_lines