File size: 962 Bytes
fd54b70
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
from nltk import word_tokenize
import wordninja
def get_average_words_per_line(lines):
    """Return the mean number of NLTK tokens per line.

    Args:
        lines: A sized collection of strings (``len()`` is called on it).

    Returns:
        The total token count over all lines divided by the number of
        lines, or ``0.0`` when ``lines`` is empty.
    """
    line_count = len(lines)
    # An empty collection has no meaningful average; return 0.0 instead of
    # letting the division below raise ZeroDivisionError.
    if line_count == 0:
        return 0.0
    total_tokens = sum(len(word_tokenize(line)) for line in lines)
    return total_tokens / line_count
    
def get_average_line_len(lines):
    """Return the mean length, in characters, of the given lines.

    Args:
        lines: A sized collection of strings (``len()`` is called on it
            and on each element).

    Returns:
        The summed length of all lines divided by the number of lines, or
        ``0.0`` when ``lines`` is empty.
    """
    line_count = len(lines)
    # An empty collection has no meaningful average; return 0.0 instead of
    # letting the division below raise ZeroDivisionError.
    if line_count == 0:
        return 0.0
    total_chars = sum(len(line) for line in lines)
    return total_chars / line_count
        
def percentage_difference(value1, value2):
    """Return the difference between two values as a percentage of their mean.

    Computes ``abs(value1 - value2) / ((value1 + value2) / 2) * 100``.

    Args:
        value1: First number.
        value2: Second number.

    Returns:
        The percentage difference as a float. Two equal values (including
        two zeros) yield ``0.0``.

    Raises:
        ZeroDivisionError: If the values differ but sum to zero, since the
            mean is then zero and the ratio is undefined.
    """
    # Equal inputs differ by 0%. Handling this first also avoids 0/0 when
    # both values are zero (e.g. comparing two empty token lists).
    if value1 == value2:
        return 0.0
    average_value = (value1 + value2) / 2
    diff = abs(value1 - value2)
    return (diff / average_value) * 100
        
def recover_text(line, threshold=150):
    """Re-segment a line with wordninja when its token counts disagree widely.

    The line is tokenized with NLTK's ``word_tokenize`` and split with
    ``wordninja.split``. If the percentage difference between the two counts
    exceeds ``threshold``, the wordninja pieces joined by single spaces are
    returned; otherwise the line is returned unchanged.
    NOTE(review): presumably this restores spaces lost during text
    extraction -- confirm against the caller.

    Args:
        line: The text line to inspect.
        threshold: Percentage difference between the two counts above which
            the wordninja segmentation replaces the line. Defaults to 150.

    Returns:
        The wordninja pieces joined by single spaces, or ``line`` itself.
    """
    tokens = word_tokenize(line)
    # Split once and reuse the result for both the check and the output.
    words = wordninja.split(line)
    # With nothing found by either tool there is nothing to compare, and
    # the percentage would be 0/0; leave the line as it is.
    if not tokens and not words:
        return line
    if percentage_difference(len(tokens), len(words)) > threshold:
        return " ".join(words)
    return line