khrek committed on
Commit
fd54b70
1 Parent(s): 822daa6

Upload utils.py

Browse files
Files changed (1) hide show
  1. utils.py +28 -0
utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from nltk import word_tokenize
2
+ import wordninja
3
def get_average_words_per_line(lines):
    """Return the mean number of NLTK tokens per line.

    Args:
        lines: A sized collection of strings (``len(lines)`` is used).

    Returns:
        The total token count over all lines divided by the number of
        lines, or ``0.0`` when ``lines`` is empty.
    """
    line_count = len(lines)
    # An empty collection has no meaningful average; return 0.0 instead of
    # letting the division below raise ZeroDivisionError.
    if line_count == 0:
        return 0.0
    # Use the builtin sum() rather than a local accumulator that shadows it.
    total_tokens = sum(len(word_tokenize(line)) for line in lines)
    return total_tokens / line_count
9
+
10
def get_average_line_len(lines):
    """Return the mean length (via ``len``) of the given lines.

    Args:
        lines: A sized collection of strings (``len(lines)`` is used).

    Returns:
        The summed length of all lines divided by the number of lines, or
        ``0.0`` when ``lines`` is empty.
    """
    line_count = len(lines)
    # An empty collection has no meaningful average; return 0.0 instead of
    # letting the division below raise ZeroDivisionError.
    if line_count == 0:
        return 0.0
    # Use the builtin sum() rather than a local accumulator that shadows it.
    total_length = sum(len(line) for line in lines)
    return total_length / line_count
15
+
16
def percentage_difference(value1, value2):
    """Return the symmetric percentage difference between two numbers.

    Computed as ``abs(value1 - value2) / ((value1 + value2) / 2) * 100``.

    Args:
        value1: First number.
        value2: Second number.

    Returns:
        The difference as a percentage of the mean of the two values.
        Equal values (including ``0`` and ``0``) yield ``0.0``.

    Raises:
        ZeroDivisionError: If the values differ but sum to zero, in which
            case the relative difference is undefined.
    """
    diff = abs(value1 - value2)
    # Equal values differ by 0% whatever their magnitude. Handling this
    # first also avoids dividing by a zero mean when both values are 0.
    if diff == 0:
        return 0.0
    average_value = (value1 + value2) / 2
    return (diff / average_value) * 100
21
+
22
def recover_text(line, threshold=150):
    """Re-segment a line with wordninja when its token counts disagree.

    The line is tokenized with NLTK's ``word_tokenize`` and split with
    ``wordninja.split``. If the percentage difference between the two counts
    exceeds ``threshold``, the wordninja pieces joined by single spaces are
    returned; otherwise the line is returned unchanged.

    Args:
        line: The text line to inspect.
        threshold: Percentage difference above which the wordninja
            segmentation replaces the original line. Defaults to 150.

    Returns:
        Either the original ``line`` or its space-joined wordninja split.
    """
    # A blank line has nothing to re-segment, and an empty line can yield
    # two zero counts, which would divide by zero in the comparison below.
    if not line.strip():
        return line
    tokens = word_tokenize(line)
    # Split once and reuse the result for both the test and the output.
    pieces = wordninja.split(line)
    if percentage_difference(len(tokens), len(pieces)) > threshold:
        return " ".join(pieces)
    return line
27
+
28
+