Spaces:
Running
Running
File size: 962 Bytes
fd54b70 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 |
from nltk import word_tokenize
import wordninja
def get_average_words_per_line(lines):
    """Return the mean number of tokens per line.

    Each line is tokenized with ``word_tokenize`` and the token counts are
    averaged over ``len(lines)``.

    Args:
        lines: A sized collection (e.g. a list) of strings.

    Returns:
        The average token count as a float, or ``0.0`` when ``lines`` is
        empty (the unguarded division would otherwise raise
        ``ZeroDivisionError``).
    """
    if not lines:
        return 0.0
    # Use the builtin sum() rather than shadowing it with a local accumulator.
    total_tokens = sum(len(word_tokenize(line)) for line in lines)
    return total_tokens / len(lines)
def get_average_line_len(lines):
    """Return the mean length, in characters, of the given lines.

    Args:
        lines: A sized collection (e.g. a list) of strings.

    Returns:
        The average of ``len(line)`` over all lines as a float, or ``0.0``
        when ``lines`` is empty (the unguarded division would otherwise raise
        ``ZeroDivisionError``).
    """
    if not lines:
        return 0.0
    # Use the builtin sum() rather than shadowing it with a local accumulator.
    total_chars = sum(len(line) for line in lines)
    return total_chars / len(lines)
def percentage_difference(value1, value2) -> float:
    """Return the percentage difference between two numbers.

    The result is ``abs(value1 - value2)`` divided by the mean of the two
    values, times 100. The measure is symmetric in its arguments.

    Args:
        value1: First number.
        value2: Second number.

    Returns:
        The percentage difference. When the mean of the two values is zero
        the ratio is undefined; in that case ``0.0`` is returned if the values
        are equal (both zero) and ``float("inf")`` otherwise, instead of
        raising ``ZeroDivisionError``.
    """
    average_value = (value1 + value2) / 2
    diff = abs(value1 - value2)
    if average_value == 0:
        # Zero denominator: identical values differ by 0%; opposite-signed
        # values have no finite relative difference.
        return 0.0 if diff == 0 else float("inf")
    return (diff / average_value) * 100
def recover_text(line: str, threshold: float = 150) -> str:
    """Re-insert spaces into a line when its words appear to be run together.

    The line is tokenized with ``word_tokenize`` and, independently, segmented
    with ``wordninja.split``. If the two counts differ by more than
    ``threshold`` percent (see ``percentage_difference``), the segmented words
    joined by single spaces are returned; otherwise the line is returned
    unchanged.

    Args:
        line: The text line to inspect.
        threshold: Percentage difference between the two counts above which
            the line is rewritten. With the default of 150, and non-negative
            counts, this means one count is more than seven times the other.

    Returns:
        The space-joined ``wordninja`` segmentation, or the original ``line``.
    """
    tokens = word_tokenize(line)
    # Segment once and reuse the result for both the test and the output.
    segments = wordninja.split(line)

    # Nothing to compare (e.g. an empty line): avoid a 0/0 percentage
    # difference and leave the line untouched.
    if not tokens and not segments:
        return line

    # Disabled alternative heuristic, kept for reference:
    # condition = percentage_difference(line_width, len(tokens)) > percentage_difference(average_width, avg_tokens)
    if percentage_difference(len(tokens), len(segments)) > threshold:
        return " ".join(segments)
    return line
|