File size: 2,451 Bytes
6ebf426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import numpy as np

def mask_func(tokenized_sen, *, mask_prob=0.5, span_lambda=3,
              max_consecutive_masks=8, mask_token='<mask>'):
    """Randomly replace spans of tokens with a mask token.

    The ``.text`` of every element of ``tokenized_sen`` is split on single
    spaces and the pieces are concatenated into one token stream. The stream
    is then walked left to right:

    * A token containing any of ``. ( ) [ ]`` is copied through unchanged.
    * Otherwise a span length is drawn from ``Poisson(span_lambda)`` and a
      uniform number from ``[0, 1)``. If the uniform draw is below
      ``mask_prob`` and the length is positive, the next ``length`` tokens
      are skipped and one ``mask_token`` is emitted in their place.
    * Otherwise the token is copied through unchanged.

    Args:
        tokenized_sen: Sized iterable of objects exposing a ``.text`` string
            attribute.
        mask_prob: Probability threshold compared against the uniform draw.
        span_lambda: Rate of the Poisson distribution used for span lengths.
        max_consecutive_masks: Cap on mask tokens emitted back to back. Once
            the cap is reached, further spans are still skipped but nothing
            is emitted for them, i.e. those tokens are dropped. The counter
            resets whenever a token is copied through.
        mask_token: String emitted in place of a masked span.

    Returns:
        ``[]`` when ``tokenized_sen`` is empty; otherwise a one-element list
        holding the resulting tokens joined by single spaces.

    Note:
        Randomness comes from NumPy's global ``np.random`` state, and the
        Poisson draw always precedes the uniform draw, so seeding with
        ``np.random.seed`` makes the output reproducible. A skipped span is
        advanced purely by index, so it may run over tokens that would
        otherwise have been copied through as protected.
    """
    if len(tokenized_sen) == 0:
        return []

    token_list = []
    for sen in tokenized_sen:
        token_list.extend(sen.text.split(' '))

    protected_chars = '.()[]'
    ret_list = []
    i = 0
    mask_num = 0  # number of mask tokens emitted since the last kept token
    while i < len(token_list):
        t = token_list[i]
        if any(ch in t for ch in protected_chars):
            ret_list.append(t)
            i += 1
            mask_num = 0
            continue

        # Draw order (Poisson first, then uniform) is kept fixed so seeded
        # runs consume the random stream identically on every branch.
        length = np.random.poisson(span_lambda)
        if np.random.rand() < mask_prob and length > 0:
            if mask_num < max_consecutive_masks:
                ret_list.append(mask_token)
                mask_num += 1
            # The span is skipped even when the cap suppressed the mask.
            i += length
        else:
            ret_list.append(t)
            i += 1
            mask_num = 0
    return [' '.join(ret_list)]

def find_mini_span(vec, words, check_set):
    """Find the shortest flagged-to-flagged word span covering all hits.

    A "hit" is an element of ``check_set`` that occurs as a substring of a
    piece of text. The target is the number of hits in the full sentence
    (``words`` joined by spaces). Among all windows ``words[start:stop]``
    whose first and last positions are both flagged ``True`` in ``vec`` and
    whose joined text reaches the target hit count, the shortest is chosen;
    on ties the earliest start wins.

    Args:
        vec: Mutable sequence of flags, indexed in parallel with ``words``.
        words: Sequence of word strings.
        check_set: Iterable of strings to look for.

    Returns:
        A ``(vec, span)`` tuple. ``vec`` is the same object that was passed
        in, with every position inside the chosen window set to ``True``.
        ``span`` is the window's words joined by spaces, or ``''`` when no
        window qualifies (in which case ``vec`` is left untouched).
    """

    def count_hits(fragment):
        # Substring containment, not whole-word matching.
        return sum(1 for term in check_set if term in fragment)

    target = count_hits(' '.join(words))

    size = len(vec)
    best_len = 10000000
    span = ''
    bounds = None
    for start in range(size):
        # Explicit comparison with True is kept on purpose: it matches
        # bool-like elements (e.g. NumPy booleans) exactly as before.
        if not (vec[start] == True):  # noqa: E712
            continue

        # Smallest exclusive end whose last word is flagged and whose text
        # already contains every hit found in the full sentence.
        stop = -1
        for candidate in range(start + 1, size + 1):
            if vec[candidate - 1] == True and \
                    count_hits(' '.join(words[start:candidate])) == target:  # noqa: E712
                stop = candidate
                break

        if stop > 0 and stop - start < best_len:
            best_len = stop - start
            span = ' '.join(words[start:stop])
            bounds = (start, stop)

    if bounds is not None:
        first, last = bounds
        for pos in range(first, last):
            vec[pos] = True
    return vec, span

def process(text):
    """Tidy spacing around punctuation in ``text`` and return the result.

    The replacements below are applied one after another, in this order:

    1. Insert a space between a period and a directly following ASCII
       capital letter (``'.A'`` -> ``'. A'``).
    2. Drop the space after an opening ``(``, ``[`` or ``{``.
    3. Drop the space before a closing ``)``, ``]`` or ``}``.
    4. Drop the space between a digit and a percent sign.
    5. Drop the space before ``.``, ``,``, ``?``, ``!``, ``:`` and ``;``.
    6. Replace each pair of spaces with one space. This is a single pass,
       so a run of three or more spaces is shortened but not fully
       collapsed.
    """
    rules = []
    rules.extend(
        (f'.{letter}', f'. {letter}')
        for letter in map(chr, range(ord('A'), ord('Z') + 1))
    )
    rules.extend((opener + ' ', opener) for opener in ('(', '[', '{'))
    rules.extend((' ' + closer, closer) for closer in (')', ']', '}'))
    rules.extend((f'{digit} %', f'{digit}%') for digit in range(10))
    rules.extend((' ' + mark, mark) for mark in ('.', ',', '?', '!', ':', ';'))
    rules.append(('  ', ' '))

    # Order matters: later rules see the output of earlier ones.
    for old, new in rules:
        text = text.replace(old, new)
    return text