import re

import numpy as np

# Tokens containing any of these characters are never masked.
_PROTECTED_CHARS = '.()[]'

# A period immediately followed by an ASCII capital letter (missing space).
_PERIOD_CAPITAL_RE = re.compile(r'\.([A-Z])')
# An ASCII digit separated from a percent sign by one space.
_DIGIT_PERCENT_RE = re.compile(r'([0-9]) %')
# Two or more consecutive spaces.
_MULTI_SPACE_RE = re.compile(r' {2,}')


def mask_func(tokenized_sen, *, mask_prob=0.5, span_lambda=3,
              max_consecutive_masks=8, mask_token=''):
    """Randomly replace spans of tokens with a placeholder.

    The ``.text`` of every sentence is split on single spaces and the
    resulting tokens are walked left to right.  A token containing one of
    ``. ( ) [ ]`` is always copied through.  For any other token a span
    length is drawn from ``Poisson(span_lambda)``; with probability
    ``mask_prob`` (and if the drawn length is positive) the next
    ``length`` tokens are skipped and a single ``mask_token`` is emitted
    in their place.

    Args:
        tokenized_sen: Sequence of objects exposing a ``text`` string
            attribute.
        mask_prob: Probability of starting a masked span at an eligible
            token.
        span_lambda: Rate of the Poisson distribution used for span
            lengths.
        max_consecutive_masks: After this many back-to-back masked spans,
            further spans are still skipped but no placeholder is
            emitted for them (the tokens are simply dropped) until an
            unmasked token resets the counter.
        mask_token: String emitted for each masked span.
            NOTE(review): the default is the empty string, which yields
            doubled spaces in the output; this looks like a stripped
            MASK-style sentinel -- confirm with the caller.

    Returns:
        ``[]`` for empty input, otherwise a one-element list holding the
        space-joined result.

    Note:
        Uses the global ``np.random`` state; seed it for reproducibility.
    """
    if len(tokenized_sen) == 0:
        return []

    tokens = []
    for sen in tokenized_sen:
        tokens.extend(sen.text.split(' '))

    kept = []
    pos = 0
    consecutive_masks = 0
    while pos < len(tokens):
        token = tokens[pos]

        if any(ch in token for ch in _PROTECTED_CHARS):
            kept.append(token)
            pos += 1
            consecutive_masks = 0
            continue

        # Both numbers are drawn on every eligible token (length first)
        # so the random stream does not depend on the outcome.
        span_len = np.random.poisson(span_lambda)
        start_mask = np.random.rand() < mask_prob

        if start_mask and span_len > 0:
            if consecutive_masks < max_consecutive_masks:
                kept.append(mask_token)
                consecutive_masks += 1
            # The span is skipped even when the placeholder was capped.
            pos += span_len
        else:
            kept.append(token)
            pos += 1
            consecutive_masks = 0

    return [' '.join(kept)]


def _count_hits(text, terms):
    """Return how many entries of ``terms`` occur as substrings of ``text``."""
    return sum(1 for term in terms if term in text)


def find_mini_span(vec, words, check_set):
    """Find the shortest flagged span covering as many terms as the full text.

    A candidate span ``words[i:j]`` must start and end on positions whose
    flag in ``vec`` is truthy.  It qualifies when the number of
    ``check_set`` entries found (by substring match) in its space-joined
    text equals the number found in the space-joined ``words`` as a
    whole.  For each start only the first qualifying end is considered,
    and the earliest of the shortest candidates wins.

    Args:
        vec: Mutable sequence of flags, indexed in parallel with
            ``words``.  It is modified in place.
        words: Sequence of word strings.
        check_set: Iterable of strings to look for.

    Returns:
        A ``(vec, span)`` tuple: the same ``vec`` object with every
        position inside the chosen span set to ``True``, and the chosen
        span as a space-joined string (``''`` if no span qualifies).
    """
    target = _count_hits(' '.join(words), check_set)

    best_len = float('inf')
    best_range = None
    span = ''

    for start, flagged in enumerate(vec):
        if not flagged:
            continue

        end = -1
        for stop in range(start + 1, len(vec) + 1):
            if not vec[stop - 1]:
                continue
            candidate = ' '.join(words[start:stop])
            if _count_hits(candidate, check_set) == target:
                end = stop
                break

        # Strict comparison keeps the earliest span among equal lengths.
        if end > 0 and (end - start) < best_len:
            best_len = end - start
            span = ' '.join(words[start:end])
            best_range = (start, end)

    if best_range:
        for idx in range(best_range[0], best_range[1]):
            vec[idx] = True

    return vec, span


def process(text):
    """Tidy spacing around punctuation in ``text``.

    Steps, applied in order:

    1. Insert a space after a period directly followed by ``A``-``Z``.
    2. Remove one space after each opening bracket ``( [ {`` and one
       space before each closing bracket ``) ] }``.
    3. Join an ASCII digit to a following ``%`` separated by one space.
    4. Remove one space before each of ``. , ? ! : ;``.
    5. Collapse runs of spaces into a single space.

    Args:
        text: Input string.

    Returns:
        The cleaned-up string.
    """
    text = _PERIOD_CAPITAL_RE.sub(r'. \1', text)

    for opener in ('(', '[', '{'):
        text = text.replace(opener + ' ', opener)
    for closer in (')', ']', '}'):
        text = text.replace(' ' + closer, closer)

    text = _DIGIT_PERCENT_RE.sub(r'\1%', text)

    for mark in ('.', ',', '?', '!', ':', ';'):
        text = text.replace(' ' + mark, mark)

    # The previous final step replaced ' ' with ' ', which did nothing;
    # collapsing repeated spaces is assumed to be the intended behavior.
    text = _MULTI_SPACE_RE.sub(' ', text)
    return text