Scorpius_HF / server /server_utils.py
yjwtheonly
server
6ebf426
raw
history blame
No virus
2.45 kB
import numpy as np
def mask_func(tokenized_sen, mask_prob=0.5, mean_span_len=3,
              max_consecutive_masks=8, mask_token='<mask>'):
    """Randomly replace spans of tokens with a mask token.

    The ``text`` of every element of ``tokenized_sen`` is split on single
    spaces and the resulting tokens are concatenated into one stream.  The
    stream is then walked left to right:

    * A token containing any of ``. ( ) [ ]`` is always copied through.
    * Otherwise a span length is drawn from ``Poisson(mean_span_len)``; with
      probability ``mask_prob`` (and a non-zero length) that many tokens are
      skipped and replaced by a single ``mask_token``.  The skip is purely
      by count, so it may also swallow protected tokens further ahead.
    * Failing that, the token is copied through unchanged.

    At most ``max_consecutive_masks`` mask tokens are emitted back to back;
    once that cap is reached, further masked spans are still skipped but no
    additional mask token is written until a real token is copied again.

    Args:
        tokenized_sen: Sized collection of objects exposing a ``text`` string
            attribute (presumably sentence spans from a tokenizer -- confirm
            against the caller).
        mask_prob: Probability of starting a masked span at an unprotected
            token.  ``0`` disables masking entirely.
        mean_span_len: Mean of the Poisson distribution used for span lengths.
        max_consecutive_masks: Cap on adjacent mask tokens in the output.
        mask_token: String written in place of each masked span.

    Returns:
        ``[]`` when ``tokenized_sen`` is empty, otherwise a one-element list
        holding the masked tokens joined by single spaces.
    """
    if len(tokenized_sen) == 0:
        return []

    token_list = []
    for sen in tokenized_sen:
        token_list.extend(sen.text.split(' '))

    protected_chars = '.()[]'
    ret_list = []
    i = 0
    mask_num = 0  # number of mask tokens emitted since the last real token
    while i < len(token_list):
        t = token_list[i]
        if any(ch in t for ch in protected_chars):
            ret_list.append(t)
            i += 1
            mask_num = 0
            continue

        # Draw order (poisson first, then rand) is kept fixed so that seeded
        # runs reproduce the same masking as before.
        length = np.random.poisson(mean_span_len)
        if np.random.rand() < mask_prob and length > 0:
            if mask_num < max_consecutive_masks:
                ret_list.append(mask_token)
                mask_num += 1
            # The span is consumed even when the cap suppressed the mask.
            i += length
        else:
            ret_list.append(t)
            i += 1
            mask_num = 0

    return [' '.join(ret_list)]
def find_mini_span(vec, words, check_set):
    """Find the shortest flagged window of ``words`` that covers ``check_set``.

    "Coverage" is the number of entries of ``check_set`` that occur as
    substrings of a space-joined piece of text.  The target coverage is that
    of all ``words`` joined together.  A candidate window ``words[start:stop]``
    must begin and end on positions whose ``vec`` entry equals ``True``; for
    each start only the first window reaching the target coverage is
    considered, and the shortest such window overall wins (the earliest start
    wins ties).

    Args:
        vec: Mutable sequence of flags, indexed in parallel with ``words``.
        words: Sequence of word strings.
        check_set: Iterable of strings to look for.

    Returns:
        ``(vec, span)`` where ``span`` is the winning window joined by spaces
        (``''`` if no window qualifies).  ``vec`` is the same object passed
        in, modified in place so that every position inside the winning
        window is ``True``.
    """

    def hits(fragment):
        # Number of check_set entries appearing as substrings of fragment.
        return sum(1 for item in check_set if item in fragment)

    target = hits(' '.join(words))

    # Positions flagged exactly True; vec is not modified until the search
    # is over, so this can be computed once up front.
    anchors = [k for k in range(len(vec)) if vec[k] == True]  # noqa: E712

    best_len = 10000000
    best_text = ''
    best_bounds = None
    for pos, start in enumerate(anchors):
        for last in anchors[pos:]:
            stop = last + 1
            if hits(' '.join(words[start:stop])) != target:
                continue
            # First window from this start that reaches full coverage.
            if stop - start < best_len:
                best_len = stop - start
                best_text = ' '.join(words[start:stop])
                best_bounds = (start, stop)
            break

    if best_bounds:
        for k in range(*best_bounds):
            vec[k] = True
    return vec, best_text
def process(text):
    """Tidy spacing around punctuation in ``text``.

    Applied in order:

    1. Insert a space between a period and an immediately following
       uppercase ASCII letter (``"end.Next"`` -> ``"end. Next"``).
    2. Remove the space just inside opening ``( [ {`` and closing ``) ] }``
       brackets.
    3. Remove the space between a digit and a following ``%``.
    4. Remove the space before ``. , ? ! : ;``.
    5. Replace each pair of adjacent spaces with one space (single pass).

    Args:
        text: The string to clean up.

    Returns:
        The cleaned string.
    """
    for code in range(ord('A'), ord('Z') + 1):
        letter = chr(code)
        text = text.replace(f'.{letter}', f'. {letter}')

    for opener in ('(', '[', '{'):
        text = text.replace(opener + ' ', opener)
    for closer in (')', ']', '}'):
        text = text.replace(' ' + closer, closer)

    for digit in range(10):
        text = text.replace(f'{digit} %', f'{digit}%')

    for mark in ('.', ',', '?', '!', ':', ';'):
        text = text.replace(' ' + mark, mark)

    # The previous final step replaced a single space with a single space,
    # which did nothing; squeeze doubled spaces instead.
    text = text.replace('  ', ' ')
    return text