Spaces:
Running
Running
File size: 3,336 Bytes
c5ed230 d94ccbe c5ed230 35c29ec c5ed230 5854014 c5ed230 35c29ec c5ed230 5854014 35c29ec 5854014 c5ed230 5854014 c5ed230 5854014 c5ed230 5854014 c5ed230 5854014 c5ed230 5854014 c5ed230 5854014 c5ed230 5854014 c5ed230 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 |
import regex as re
import config
from .utils import check_is_none
from logger import logger
# Read the config to choose the language-identification library;
# falls back to "fastlid" when LANGUAGE_IDENTIFICATION_LIBRARY is not set.
clf = getattr(config, "LANGUAGE_IDENTIFICATION_LIBRARY", "fastlid")
# Character class of punctuation that separates the runs of text handed to the
# language detector.
_LANG_SPLIT_PATTERN = (
    # ASCII punctuation.
    r'[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\>\=\?\@\[\]\{\}\\\\\^\_\`'
    # Remaining ASCII marks plus fullwidth / halfwidth CJK punctuation.
    # NOTE(review): the fullwidth forms in this fragment are reconstructed;
    # confirm the exact set against the upstream project.
    r'\|\~！？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠⦅⦆｢｣「」､、〃》'
    # CJK brackets, quotation marks, dashes and ellipsis.
    r'『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘\'\‛\“\”\„\‟…‧﹏.]+'
)


def _select_lang_detector(speaker_lang):
    """Return the detection callable for the library named by ``clf``.

    The returned callable is used as ``detect(word)[0]`` to obtain a language
    code. When ``speaker_lang`` is not None it is passed to the library as the
    set of candidate languages.

    Raises:
        ValueError: if ``clf`` names neither fastlid/fasttext nor langid.
    """
    backend = clf.upper()
    if backend in ("FASTLID", "FASTTEXT"):
        from fastlid import fastlid
        # NOTE(review): when speaker_lang is None nothing is reset here, so a
        # restriction installed by an earlier call presumably stays in
        # effect -- confirm this is intended.
        if speaker_lang is not None:
            # fastlid takes its language restriction by attribute assignment.
            fastlid.set_languages = speaker_lang
        return fastlid
    if backend == "LANGID":
        import langid
        if speaker_lang is not None:
            langid.set_languages(speaker_lang)
        return langid.classify
    raise ValueError("Wrong LANGUAGE_IDENTIFICATION_LIBRARY in config.py")


def clasify_lang(text, speaker_lang):
    """Annotate ``text`` with ``[LANG]`` tags around runs of one language.

    The text is split on punctuation and each non-blank run is classified.
    The first run is prefixed with ``[LANG]``; whenever the language changes,
    ``[PREVIOUS][NEW]`` is inserted in front of the run; the text is finally
    closed with the tag of the last detected language. Punctuation and blank
    runs are copied through unchanged.

    Args:
        text: the string to annotate.
        speaker_lang: candidate languages for the detector, or None to leave
            the detector's candidates as they are.

    Returns:
        The annotated string. If no run could be classified (empty or
        punctuation-only input) the closing tag is the empty tag ``[]``.

    Raises:
        ValueError: if the configured identification library is unknown.
    """
    words = re.split(_LANG_SPLIT_PATTERN, text)
    # The pattern has no capture groups and never matches the empty string,
    # so there is exactly one separator between consecutive words.
    separators = re.findall(_LANG_SPLIT_PATTERN, text)
    detect = _select_lang_detector(speaker_lang)

    # Rebuild the string from its parts instead of patching it in place:
    # every tag lands directly in front of its own word, and a word can never
    # be mistaken for the letters of a tag that was inserted earlier.
    pieces = []
    pre = ""
    for index, word in enumerate(words):
        if not check_is_none(word):
            lang = detect(word)[0]
            if pre == "":
                pieces.append(f"[{lang.upper()}]")
            elif pre != lang:
                pieces.append(f"[{pre.upper()}][{lang.upper()}]")
            pre = lang
        pieces.append(word)
        if index < len(separators):
            pieces.append(separators[index])
    pieces.append(f"[{pre.upper()}]")
    return "".join(pieces)
def cut(text, max):
    """Split ``text`` into chunks that end on punctuation.

    Punctuation runs are visited from left to right. As soon as the text
    accumulated since the previous cut (including the punctuation run itself)
    is at least ``max`` characters long, it is emitted as one chunk with
    surrounding whitespace stripped. Whatever follows the last cut is appended
    as a final chunk, unstripped.

    Args:
        text: the string to split.
        max: minimum length a chunk must reach before it is cut. With a value
            of 0 or less every punctuation run ends a chunk.

    Returns:
        A list of chunks; empty when ``text`` is empty.
    """
    # ASCII and CJK sentence punctuation. The fullwidth forms of ? , ; : are
    # listed next to their ASCII counterparts so that CJK text is cut at them
    # as well.
    # NOTE(review): the fullwidth forms are reconstructed; confirm the exact
    # set against the upstream project.
    pattern = r'[!(),—+\-.:;?？。，、；：]+'
    sentence_list = []
    start = 0  # offset in text where the chunk being accumulated begins
    for separator in re.finditer(pattern, text):
        end = separator.end()
        # end - start is the length of the text plus punctuation gathered
        # since the last cut.
        if end - start >= max:
            sentence_list.append(text[start:end].strip())
            start = end
    # Append whatever text is left after the last cut.
    if start < len(text):
        sentence_list.append(text[start:])
    return sentence_list
def sentence_split(text, max=50, lang="auto", speaker_lang=None):
    """Split ``text`` into chunks and wrap each chunk in language tags.

    Args:
        text: the string to process.
        max: chunk length passed to ``cut``; with a value of 0 or less the
            text is kept as a single chunk.
        lang: "auto" to detect languages with ``clasify_lang``, "mix" to
            return the text untouched as a single element, or a language code
            that is wrapped around every chunk as ``[LANG]...[LANG]``
            (matched case-insensitively).
        speaker_lang: languages supported by the speaker, or None. When it
            holds exactly one language, that language replaces ``lang``.

    Returns:
        The list of resulting strings; each one is also logged at debug level.
    """
    # A speaker that supports a single language always gets that language.
    only_one_language = speaker_lang is not None and len(speaker_lang) == 1
    if only_one_language:
        explicit_request = lang.upper() not in ["AUTO", "MIX"]
        if explicit_request and lang.lower() != speaker_lang[0]:
            logger.debug(
                f"lang \"{lang}\" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}")
        lang = speaker_lang[0]

    def annotate(piece):
        # Auto mode delegates to detection; otherwise wrap in the fixed tag.
        if lang.upper() == "AUTO":
            return clasify_lang(piece, speaker_lang)
        return f"[{lang.upper()}]{piece}[{lang.upper()}]"

    if lang.upper() == "MIX":
        # Mixed-language input is passed through as is.
        sentence_list = [text]
    elif max <= 0:
        sentence_list = [annotate(text)]
    else:
        sentence_list = [
            annotate(piece)
            for piece in cut(text, max)
            if not check_is_none(piece)
        ]

    for sentence in sentence_list:
        logger.debug(sentence)
    return sentence_list
|