# Train a BPE SentencePiece tokenizer on a DNA corpus plus a protein corpus,
# then load the resulting model and tokenize one sample sequence as a check.
import sentencepiece as spm
from sentencepiece import SentencePieceProcessor

# Options handed to the SentencePiece trainer.
trainer_options = {
    # Comma-separated list of the training text files (DNA and protein).
    "input": '../01-data_env/data/dna_1g.txt,../01-data_env/data/protein_1g.txt',
    # Prefix of the trainer's output files; the ".model" file is loaded below.
    "model_prefix": 'gene_bpe_seg',
    "vocab_size": 60000,
    # Explicitly request BPE; the library default is unigram.
    "model_type": 'bpe',
    "num_threads": 10,
}
spm.SentencePieceTrainer.train(**trainer_options)

# Load the freshly trained model from disk.
model_path = "gene_bpe_seg.model"
sp_model = SentencePieceProcessor(model_file=model_path)

# Sample input: a DNA-looking prefix followed by protein-looking residues
# (presumably chosen to exercise both parts of the vocabulary).
sample_sequence = "TCGACGGCACGCGACAGCAGCGAGCCCCGCGCACCCGAGCGCGAKCGFVGPMVHLKVHLEADVASSCRSAVIYLTSEEPFEGVLGLRLKEGIAITGCWPRWPDEMDERSAVWRVEPYTRHFGRVLYSFGV"

# Split the sample into subword pieces and show them.
mm = sp_model.EncodeAsPieces(sample_sequence)
print(mm)