hdeldar committed on
Commit
d60645b
1 Parent(s): 3440b09

add utlis.py

Browse files
Files changed (1) hide show
  1. utlis.py +75 -0
utlis.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import re
3
+ import cv2
4
+
5
def read_custum_html(file_name):
    """Return the full text content of the file at *file_name*.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        file_name: Path of the file to read.

    Returns:
        The file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(file_name, encoding="utf-8") as f:
        return f.read()
8
+
9
def get_lines(result, line_gap=18):
    """Group detection entries into text lines by their top edge.

    Entries are visited in the order given. An entry joins the current line
    when its top edge (``y_min``) is less than *line_gap* below the top edge
    of the previously visited entry; otherwise it starts a new line.

    Args:
        result: Sequence of entries where ``entry[0]`` is an iterable of
            ``(x, y)`` points and ``entry[1]`` is the associated text.
            # NOTE(review): presumably OCR output already ordered top to
            # bottom - confirm against the caller.
        line_gap: Minimum difference between consecutive top edges that
            starts a new line. Defaults to 18, the previously hard-coded
            value.

    Returns:
        A dict mapping consecutive line indices (starting at 0) to lists of
        ``[[x_max, y_min], text]`` items, in visiting order. An empty
        *result* yields an empty dict.
    """
    lines_dict = {}
    line_idx = 0
    y_min_prev = None
    for entry in result:
        xs, ys = zip(*entry[0])
        x_max = int(max(xs))
        y_min = int(min(ys))
        item = [[x_max, y_min], entry[1]]
        if y_min_prev is None:
            # The first entry always opens line 0.
            lines_dict[line_idx] = [item]
        elif y_min - y_min_prev < line_gap:
            lines_dict[line_idx].append(item)
        else:
            line_idx += 1
            lines_dict[line_idx] = [item]
        # Always compare top edge to top edge. Storing the bottom edge after
        # opening a new line made a single-entry line absorb the next line
        # whenever the gap between them was smaller than line_gap.
        y_min_prev = y_min
    return lines_dict
31
+
32
def annotate_image(image, result, color=(0, 0, 255), thickness=1):
    """Draw an axis-aligned bounding rectangle on *image* for each entry.

    The rectangle for an entry spans the minimum and maximum x and y values
    of the points in ``entry[0]``. Drawing is done with ``cv2.rectangle``
    directly on the object passed in.

    Args:
        image: Image object accepted by ``cv2.rectangle``.
        result: Iterable of entries where ``entry[0]`` is an iterable of
            ``(x, y)`` points.
        color: Color tuple forwarded to ``cv2.rectangle``. Defaults to
            ``(0, 0, 255)``.
            # NOTE(review): this is red if the image is in OpenCV's usual
            # BGR channel order - confirm how the image is loaded.
        thickness: Line thickness forwarded to ``cv2.rectangle``.

    Returns:
        The same *image* object that was passed in.
    """
    for entry in result:
        xs, ys = zip(*entry[0])
        top_left = (int(min(xs)), int(min(ys)))
        bottom_right = (int(max(xs)), int(max(ys)))
        cv2.rectangle(image, top_left, bottom_right, color, thickness)
    return image
39
+
40
def arrange_words_in_line(lines_dict):
    """Sort the items of every line by descending x coordinate.

    Each item is expected to look like ``[[x, y], text]``; items are ordered
    by ``item[0][0]`` from largest to smallest. The sort is stable, so items
    with equal x keep their original relative order.
    # NOTE(review): descending x presumably yields right-to-left reading
    # order - confirm against the caller.

    Args:
        lines_dict: Dict mapping a line key to a list of items.

    Returns:
        A new dict with the same keys, each mapped to a new sorted list.
        The input dict and its lists are not modified.

    Raises:
        TypeError: If *lines_dict* is not a dict.
    """
    if not isinstance(lines_dict, dict):
        raise TypeError("The arg must be dict of lines")
    return {
        key: sorted(words, key=lambda word: word[0][0], reverse=True)
        for key, words in lines_dict.items()
    }
50
+
51
def get_raw_text(result):
    """Build a plain-text string from detection entries.

    The entries are grouped into lines with ``get_lines``, each line is
    ordered with ``arrange_words_in_line``, and the resulting text is passed
    through ``replace_en_num``.

    Args:
        result: Detection entries as accepted by ``get_lines``.

    Returns:
        The texts of all items joined by single spaces, with a ``'\\n'``
        token after each line. Because the newline tokens take part in the
        space join, they are surrounded by spaces, e.g. ``"w1 w2 \\n w3 \\n"``.
    """
    lines_dict = get_lines(result)
    arranged_lines_dict = arrange_words_in_line(lines_dict)
    text_list = []
    # Lines are stored under consecutive keys in insertion order, so walking
    # the values visits them first line to last.
    for words in arranged_lines_dict.values():
        text_list.extend(word[1] for word in words)
        text_list.append('\n')
    raw_text = ' '.join(text_list)
    return replace_en_num(raw_text)
63
+
64
# Translation table mapping ASCII digits 0-9 to U+0660..U+0669.
_EN_TO_ARABIC_INDIC_DIGITS = str.maketrans(
    "0123456789",
    "\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669",
)


def replace_en_num(text):
    """Replace every ASCII digit in *text* with its Arabic-Indic digit.

    Each of ``0``-``9`` is mapped to the corresponding code point in
    ``U+0660``-``U+0669``; all other characters are left unchanged.

    Args:
        text: Input string.

    Returns:
        A new string with all ASCII digits replaced.
    """
    # One translate pass covers all ten digits. The earlier chain of
    # substitutions matched "6" twice, so "7" was never converted.
    return text.translate(_EN_TO_ARABIC_INDIC_DIGITS)