dnagpt2 / 04-gene-sft /build_gene_bpe_seg.py
marisming's picture
Upload folder using huggingface_hub
1df7ad4 verified
raw
history blame contribute delete
704 Bytes
import sentencepiece as spm
# Train one BPE tokenizer over two corpora at once. The trainer receives both
# training files as a single comma-separated string in `input`, and names its
# output files from `model_prefix`.
trainer_options = dict(
    input='../01-data_env/data/dna_1g.txt,../01-data_env/data/protein_1g.txt',
    model_prefix='gene_bpe_seg',
    vocab_size=60000,
    model_type='bpe',  # the default model type is unigram
    num_threads=10,
)
spm.SentencePieceTrainer.train(**trainer_options)
from sentencepiece import SentencePieceProcessor

# Smoke test: load the tokenizer model from disk and print how it splits a
# sample sequence into pieces. The sample starts with A/C/G/T letters and then
# continues with a wider alphabet (presumably a DNA fragment followed by a
# protein fragment -- confirm against the training data).
sample_sequence = "TCGACGGCACGCGACAGCAGCGAGCCCCGCGCACCCGAGCGCGAKCGFVGPMVHLKVHLEADVASSCRSAVIYLTSEEPFEGVLGLRLKEGIAITGCWPRWPDEMDERSAVWRVEPYTRHFGRVLYSFGV"
tokenizer = SentencePieceProcessor(model_file="gene_bpe_seg.model")
print(tokenizer.EncodeAsPieces(sample_sequence))