Spaces:
Running
Running
File size: 2,451 Bytes
6ebf426 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import numpy as np
def mask_func(tokenized_sen, *, mask_prob=0.5, mean_span=3,
              max_consecutive_masks=8):
    """Randomly replace spans of tokens with ``<mask>`` placeholders.

    The ``.text`` of every item in ``tokenized_sen`` is split on single
    spaces and the pieces are concatenated into one token stream.  The
    stream is then walked left to right:

    * a token containing any of ``. ( ) [ ]`` is always kept;
    * otherwise a span length is drawn from ``Poisson(mean_span)`` and,
      with probability ``mask_prob`` (and a non-zero length), that many
      tokens are skipped and a single ``<mask>`` is emitted in their place;
    * otherwise the token is kept unchanged.

    Args:
        tokenized_sen: Sized iterable of objects exposing a ``text`` string
            attribute.
        mask_prob: Probability of masking at an unprotected token.
        mean_span: Mean of the Poisson distribution for the span length.
        max_consecutive_masks: Maximum number of ``<mask>`` placeholders
            emitted in a row; the counter resets whenever a token is kept.

    Returns:
        ``[]`` for empty input, otherwise a one-element list holding the
        masked tokens joined by single spaces.

    Note:
        Randomness comes from the global ``np.random`` state, so results
        are reproducible with ``np.random.seed``.
    """
    if len(tokenized_sen) == 0:
        return []

    tokens = []
    for sen in tokenized_sen:
        tokens += sen.text.split(' ')

    protected_chars = '.()[]'
    masked = []
    pos = 0
    run = 0  # number of '<mask>' placeholders emitted back to back
    while pos < len(tokens):
        tok = tokens[pos]
        if any(ch in tok for ch in protected_chars):
            masked.append(tok)
            pos += 1
            run = 0
            continue

        # Draw order (poisson, then rand) is kept so seeded runs stay stable.
        span_len = np.random.poisson(mean_span)
        if np.random.rand() < mask_prob and span_len > 0:
            if run < max_consecutive_masks:
                masked.append('<mask>')
                run += 1
            # NOTE(review): nesting reconstructed from a flattened source;
            # the skip is assumed to apply even when the cap suppresses the
            # placeholder -- confirm against the upstream file.
            # The skipped tokens are not inspected, so a span may swallow
            # protected tokens that follow the current one.
            pos += span_len
        else:
            masked.append(tok)
            pos += 1
            run = 0

    return [' '.join(masked)]
def find_mini_span(vec, words, check_set):
    """Find the shortest flagged-to-flagged word span covering all terms.

    A candidate span ``words[i:p]`` must start and end on positions whose
    ``vec`` entry equals ``True``.  It qualifies when its space-joined text
    contains as many ``check_set`` terms (plain substring test) as the
    space-joined text of all ``words`` does.  Among qualifying spans the
    shortest wins; ties go to the earliest start.

    Args:
        vec: Mutable sequence of flags, indexed in parallel with ``words``.
        words: Sequence of word strings.
        check_set: Iterable of strings to look for.

    Returns:
        ``(vec, span)`` where ``vec`` is the same object that was passed in,
        with every position inside the winning span set to ``True``, and
        ``span`` is the winning words joined by spaces (``''`` when no span
        qualifies).
    """
    def count_hits(text):
        # Number of check_set terms occurring as substrings of text.
        return sum(1 for term in check_set if term in text)

    target = count_hits(' '.join(words))
    best_len = None
    best_bounds = None
    span = ''
    size = len(vec)

    for start in range(size):
        # Equality (not truthiness) is kept on purpose so that non-bool
        # flag values behave exactly as before.
        if not (vec[start] == True):  # noqa: E712
            continue
        for end in range(start + 1, size + 1):
            if best_len is not None and end - start >= best_len:
                # Only a strictly shorter span can replace the current best.
                break
            if not (vec[end - 1] == True):  # noqa: E712
                continue
            if count_hits(' '.join(words[start:end])) == target:
                best_len = end - start
                best_bounds = (start, end)
                span = ' '.join(words[start:end])
                break

    if best_bounds is not None:
        for idx in range(best_bounds[0], best_bounds[1]):
            vec[idx] = True
    return vec, span
def process(text):
    """Tidy spacing around punctuation in a space-tokenised string.

    Applied in order:

    1. insert a space between a period and a directly following ASCII
       capital letter (``".A"`` -> ``". A"``);
    2. drop the space after an opening ``( [ {`` and before a closing
       ``) ] }``;
    3. drop the space between a digit and ``%``;
    4. drop the space before ``. , ? ! : ;``;
    5. collapse runs of spaces into a single space.

    Args:
        text: The string to clean up.

    Returns:
        The cleaned string.
    """
    for code in range(ord('A'), ord('Z') + 1):
        letter = chr(code)
        text = text.replace(f'.{letter}', f'. {letter}')

    for opener in ('(', '[', '{'):
        text = text.replace(opener + ' ', opener)
    for closer in (')', ']', '}'):
        text = text.replace(' ' + closer, closer)

    for digit in range(10):
        text = text.replace(f'{digit} %', f'{digit}%')

    for mark in ('.', ',', '?', '!', ':', ';'):
        text = text.replace(' ' + mark, mark)

    # The final replace in the flattened source mapped a single space to a
    # single space (a no-op); it is restored here as whitespace collapsing,
    # repeated so that runs of any length shrink to one space.
    while '  ' in text:
        text = text.replace('  ', ' ')
    return text