Spaces:
Running
Running
Upload utils.py
Browse files
utils.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from nltk import word_tokenize
|
2 |
+
import wordninja
|
3 |
+
def get_average_words_per_line(lines) -> float:
    """Return the mean number of tokens per line.

    Each line is tokenized with ``word_tokenize`` (imported from nltk at the
    top of this file) and the token counts are averaged over all lines.

    Args:
        lines: A sized collection of strings (``len(lines)`` must work).

    Returns:
        The average token count per line, or ``0.0`` when ``lines`` is empty.
    """
    line_count = len(lines)
    if line_count == 0:
        # An empty input would otherwise divide by zero below.
        return 0.0
    total_tokens = sum(len(word_tokenize(line)) for line in lines)
    return total_tokens / line_count
|
9 |
+
|
10 |
+
def get_average_line_len(lines) -> float:
    """Return the mean length, in characters, of the given lines.

    Args:
        lines: A sized collection of strings (or any objects supporting
            ``len``).

    Returns:
        The average of ``len(line)`` over all lines, or ``0.0`` when
        ``lines`` is empty.
    """
    line_count = len(lines)
    if line_count == 0:
        # An empty input would otherwise divide by zero below.
        return 0.0
    return sum(map(len, lines)) / line_count
|
15 |
+
|
16 |
+
def percentage_difference(value1, value2) -> float:
    """Return the difference between two values as a percentage of their mean.

    Computed as ``abs(value1 - value2) / ((value1 + value2) / 2) * 100``.

    Args:
        value1: First number.
        value2: Second number.

    Returns:
        The percentage difference. Equal inputs (including ``0`` and ``0``)
        yield ``0.0``.

    Raises:
        ZeroDivisionError: If the values differ but their mean is zero
            (i.e. ``value1 == -value2`` and both are non-zero), since the
            relative difference is undefined in that case.
    """
    diff = abs(value1 - value2)
    if diff == 0:
        # Identical values differ by 0%; this also avoids 0/0 when both
        # inputs are zero.
        return 0.0
    average_value = (value1 + value2) / 2
    return (diff / average_value) * 100
|
21 |
+
|
22 |
+
def recover_text(line: str, threshold=150) -> str:
    """Re-segment a line with wordninja when its word count looks wrong.

    The line is tokenized with ``word_tokenize`` and split with
    ``wordninja.split``. If the two counts differ by more than ``threshold``
    percent (as measured by ``percentage_difference``), the wordninja words
    joined by single spaces are returned; otherwise the line is returned
    unchanged.

    Args:
        line: The text line to inspect.
        threshold: Percentage difference between the two counts above which
            the wordninja segmentation replaces the line. Defaults to 150.

    Returns:
        Either the re-segmented text or the original ``line``.
    """
    if not line.strip():
        # Nothing to recover in a blank line; skip the tokenizers entirely.
        return line

    token_count = len(word_tokenize(line))
    # Split once and reuse the result for both the comparison and the output.
    segmented = wordninja.split(line)
    segment_count = len(segmented)

    if token_count + segment_count == 0:
        # Neither tokenizer found anything, so the relative difference is
        # undefined (0/0); keep the line as it is.
        return line

    if percentage_difference(token_count, segment_count) > threshold:
        return " ".join(segmented)
    return line
|
27 |
+
|
28 |
+
|