Spaces:
Running
Running
import numpy as np
def mask_func(tokenized_sen, *, mask_prob=0.5, mean_span_len=3,
              max_consecutive_masks=8):
    """Randomly replace spans of tokens with ``<mask>`` placeholders.

    The ``.text`` of every item in ``tokenized_sen`` is split on single
    spaces and the resulting tokens are walked left to right:

    * A token containing ``.``, ``(``, ``)``, ``[`` or ``]`` is always kept.
    * For any other token a span length is drawn from a Poisson
      distribution. With probability ``mask_prob`` (and only if the drawn
      length is non-zero) the next ``length`` tokens are skipped and
      replaced by a single ``<mask>``; otherwise the token is kept.
    * At most ``max_consecutive_masks`` placeholders are emitted in a row.
      Once that cap is reached, further masked spans are still skipped but
      no additional placeholder is written until a token is kept again.

    A skipped span is measured in tokens only, so it may run across
    punctuation-bearing tokens that follow its first token.

    Args:
        tokenized_sen: Sized iterable of objects exposing a ``text`` string
            attribute.
        mask_prob: Probability of starting a masked span at a candidate
            token.
        mean_span_len: Mean (lambda) of the Poisson distribution the span
            length is drawn from.
        max_consecutive_masks: Maximum number of adjacent ``<mask>``
            placeholders.

    Returns:
        ``[]`` when ``tokenized_sen`` is empty, otherwise a one-element list
        holding the space-joined masked text.
    """
    if len(tokenized_sen) == 0:
        return []

    tokens = []
    for sen in tokenized_sen:
        tokens.extend(sen.text.split(' '))

    protected_chars = '.()[]'
    pieces = []
    consecutive_masks = 0
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if any(ch in tok for ch in protected_chars):
            pieces.append(tok)
            i += 1
            consecutive_masks = 0
            continue

        # Draw the span length first and the coin flip second, for every
        # candidate token, so seeded runs consume the RNG in a fixed order.
        span_len = np.random.poisson(mean_span_len)
        if np.random.rand() < mask_prob and span_len > 0:
            if consecutive_masks < max_consecutive_masks:
                pieces.append('<mask>')
                consecutive_masks += 1
            # The span is skipped even when the cap suppressed the placeholder.
            i += span_len
        else:
            pieces.append(tok)
            i += 1
            consecutive_masks = 0

    return [' '.join(pieces)]
def find_mini_span(vec, words, check_set):
    """Find the shortest flagged window of ``words`` covering all matched terms.

    A candidate window ``words[start:end]`` must begin and end on positions
    whose entry in ``vec`` equals ``True``. A window qualifies when its
    space-joined text contains (as substrings) every term of ``check_set``
    that occurs in the space-joined full text; terms absent from the full
    text are ignored. For each start only the first qualifying end is
    considered, and among those the shortest window wins (the earliest one
    on ties).

    ``vec`` is modified in place: every position inside the winning window
    is set to ``True``.

    Args:
        vec: Mutable sequence of flags, indexed in parallel with ``words``.
        words: Sequence of word strings.
        check_set: Collection of strings to look for.

    Returns:
        A ``(vec, span)`` tuple where ``vec`` is the same object that was
        passed in and ``span`` is the space-joined winning window, or ``''``
        when no window qualifies.
    """
    full_text = ' '.join(words)
    # Every window is a substring of the full text, so only the terms found
    # in the full text can (and must) be found in a qualifying window.
    # Computing them once avoids rescanning check_set against the full text.
    required = [term for term in check_set if term in full_text]

    size = len(vec)
    best_bounds = None
    best_len = None
    span = ''
    for start in range(size):
        # Equality with True is intentional: a non-boolean entry such as 2
        # is not treated as a set flag.
        if not (vec[start] == True):  # noqa: E712
            continue
        for end in range(start + 1, size + 1):
            if not (vec[end - 1] == True):  # noqa: E712
                continue
            window = ' '.join(words[start:end])
            if all(term in window for term in required):
                # First qualifying end for this start; keep it only if it
                # is strictly shorter than the best window so far.
                if best_len is None or end - start < best_len:
                    best_len = end - start
                    best_bounds = (start, end)
                    span = window
                break

    if best_bounds is not None:
        for idx in range(best_bounds[0], best_bounds[1]):
            vec[idx] = True
    return vec, span
def process(text):
    """Tidy the spacing around punctuation in a piece of text.

    The following rewrites are applied in order:

    1. A space is inserted after a full stop that is directly followed by
       an ASCII capital letter (``".A"`` becomes ``". A"``).
    2. The space after an opening bracket and the space before a closing
       bracket are removed, for ``()``, ``[]`` and ``{}``.
    3. The space between a digit and a following ``%`` is removed.
    4. The space before ``.``, ``,``, ``?``, ``!``, ``:`` and ``;`` is
       removed.
    5. Runs of consecutive spaces are collapsed to a single space.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned string.
    """
    for code in range(ord('A'), ord('Z') + 1):
        letter = chr(code)
        text = text.replace(f'.{letter}', f'. {letter}')

    for opener in ('(', '[', '{'):
        text = text.replace(opener + ' ', opener)
    for closer in (')', ']', '}'):
        text = text.replace(' ' + closer, closer)

    for digit in range(10):
        text = text.replace(f'{digit} %', f'{digit}%')

    for mark in ('.', ',', '?', '!', ':', ';'):
        text = text.replace(' ' + mark, mark)

    # The final step used to replace a single space with a single space,
    # which did nothing. Collapse repeated spaces instead; the literal is
    # built explicitly so it cannot be reduced to one space by accident.
    double_space = ' ' * 2
    while double_space in text:
        text = text.replace(double_space, ' ')
    return text