import regex as re

import config
from logger import logger

from .utils import check_is_none

# Language-identification backend, read from config (defaults to fastlid).
clf = getattr(config, "LANGUAGE_IDENTIFICATION_LIBRARY", "fastlid")

# Runs of punctuation that separate the words handed to the language detector.
# Fullwidth/halfwidth forms are written as \uXXXX escapes (interpreted by the
# regex engine) so that Unicode normalization of this source file cannot fold
# them into their ASCII look-alikes or into a quote that ends the literal.
# NOTE(review): the fullwidth ranges are reconstructed from duplicated ASCII
# entries in the earlier character class - confirm against the upstream file.
_LANG_DELIMITERS = re.compile(
    r'['
    # ASCII punctuation
    r'!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~'
    # fullwidth ASCII punctuation and halfwidth CJK punctuation
    r'\uFF01-\uFF0F\uFF1A-\uFF20\uFF3B-\uFF40\uFF5B-\uFF64'
    # CJK punctuation and brackets
    r'。、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿⦅⦆'
    # dashes, typographic quotes, ellipsis and similar
    r'–—‘‛“”„‟…‧﹏'
    r']+'
)

# Punctuation after which ``cut`` is allowed to end a chunk: the ASCII marks,
# the CJK full stop / enumeration comma, and fullwidth ? , ; : (as escapes,
# for the same reason as above).
_CUT_DELIMITERS = re.compile(
    r'[!(),—+\-.:;?。、\uFF1F\uFF0C\uFF1B\uFF1A]+'
)


def clasify_lang(text, speaker_lang):
    """Insert ``[LANG]`` markers into *text* wherever the language changes.

    The text is divided into words at runs of punctuation and each word that
    ``check_is_none`` does not reject is classified with the backend selected
    by ``clf``. The first classified word is prefixed with ``[LANG]``; a word
    whose language differs from the previous one is prefixed with
    ``[PREV][LANG]``; the result always ends with a closing ``[LAST]`` marker
    (``[]`` when no word was classified). For example, if the backend reports
    ``en`` then ``zh``, ``"hello,你好"`` becomes
    ``"[EN]hello,[EN][ZH]你好[ZH]"``.

    Args:
        text: The string to annotate.
        speaker_lang: Languages the detector is restricted to, or ``None`` to
            leave the detector's current configuration untouched.

    Returns:
        *text* with the language markers inserted.

    Raises:
        ValueError: If ``clf`` is not FASTLID, FASTTEXT or LANGID
            (case-insensitive).
    """
    backend = clf.upper()
    if backend in ("FASTLID", "FASTTEXT"):
        from fastlid import fastlid
        detect = fastlid
        if speaker_lang is not None:
            # fastlid is configured by attribute assignment, not by a call.
            fastlid.set_languages = speaker_lang
    elif backend == "LANGID":
        import langid
        detect = langid.classify
        if speaker_lang is not None:
            langid.set_languages(speaker_lang)
    else:
        raise ValueError("Wrong LANGUAGE_IDENTIFICATION_LIBRARY in config.py")
    # NOTE(review): the restriction set above is stored on the detector module
    # and is not reset here, so it presumably persists into later calls made
    # with speaker_lang=None - confirm whether that is intended.

    # Word boundaries are taken from match offsets on the untouched input, so
    # a short word can never be "found" inside a marker inserted earlier.
    spans = [(m.start(), m.end()) for m in _LANG_DELIMITERS.finditer(text)]
    spans.append((len(text), len(text)))  # sentinel: flushes the final word

    pieces = []
    previous = ""
    cursor = 0
    for delim_start, delim_end in spans:
        word = text[cursor:delim_start]
        if not check_is_none(word):
            lang = detect(word)[0]
            if previous == "":
                pieces.append(f"[{lang.upper()}]")
            elif previous != lang:
                # Close the previous language, then open the new one.
                pieces.append(f"[{previous.upper()}][{lang.upper()}]")
            previous = lang
        # Copy the word together with the punctuation that follows it.
        pieces.append(text[cursor:delim_end])
        cursor = delim_end

    pieces.append(f"[{previous.upper()}]")
    return "".join(pieces)


def cut(text, max):
    """Split *text* into chunks of at least *max* characters.

    A chunk is only closed directly after a run of delimiter punctuation, and
    only once it has reached *max* characters, so chunks may be longer than
    *max*. Closed chunks are stripped of surrounding whitespace; the trailing
    remainder, if any, is appended as-is.

    Args:
        text: The string to split.
        max: Minimum chunk length, in characters, before a chunk is closed.

    Returns:
        The list of chunks in their original order.
    """
    sentence_list = []
    chunk_start = 0
    # Walk the text one delimiter run at a time.
    for match in _CUT_DELIMITERS.finditer(text):
        chunk_end = match.end()
        if chunk_end - chunk_start >= max:
            sentence_list.append(text[chunk_start:chunk_end].strip())
            chunk_start = chunk_end

    # Append whatever text is left over.
    if chunk_start < len(text):
        sentence_list.append(text[chunk_start:])

    return sentence_list


def sentence_split(text, max=50, lang="auto", speaker_lang=None):
    """Split *text* into chunks wrapped in language markers.

    Args:
        text: The string to split.
        max: Chunk length threshold passed to ``cut``; a value ``<= 0``
            disables splitting and the whole text becomes one chunk.
        lang: ``"auto"`` to detect languages with ``clasify_lang``, ``"mix"``
            to return the text unchanged as a single chunk, or a language
            code used to wrap every chunk as ``[LANG]chunk[LANG]``. Compared
            case-insensitively.
        speaker_lang: Languages supported by the speaker, or ``None``.

    Returns:
        The list of chunks; each one is also written to the debug log.
    """
    # A speaker that supports exactly one language always uses it; the debug
    # message is only emitted when the caller explicitly asked for another.
    if speaker_lang is not None and len(speaker_lang) == 1:
        if lang.upper() not in ["AUTO", "MIX"] and lang.lower() != speaker_lang[0]:
            logger.debug(
                f"lang \"{lang}\" is not in speaker_lang {speaker_lang},automatically set lang={speaker_lang[0]}")
        lang = speaker_lang[0]

    sentence_list = []
    if lang.upper() == "MIX":
        sentence_list.append(text)
    else:
        if max <= 0:
            segments = [text]
        else:
            segments = [s for s in cut(text, max) if not check_is_none(s)]

        tag = f"[{lang.upper()}]"
        for segment in segments:
            if lang.upper() == "AUTO":
                sentence_list.append(clasify_lang(segment, speaker_lang))
            else:
                sentence_list.append(f"{tag}{segment}{tag}")

    for sentence in sentence_list:
        logger.debug(sentence)

    return sentence_list