dnagpt
/

dnagpt2

Model card Files Files and versions Metrics Training metrics Community

dnagpt2 / 04-gene-sft /build_gene_bpe_seg.py

marisming's picture

Upload folder using huggingface_hub

1df7ad4 verified about 1 month ago

history blame contribute delete

704 Bytes

	import sentencepiece as spm

	# Train a single BPE vocabulary shared by the DNA and the protein corpus.
	# The trainer receives multiple input files as one comma-separated string.
	corpus_files = [
		'../01-data_env/data/dna_1g.txt',
		'../01-data_env/data/protein_1g.txt',
	]
	trainer_options = dict(
		input=','.join(corpus_files),
		model_prefix='gene_bpe_seg',  # prefix of the files the trainer writes
		vocab_size=60000,
		model_type='bpe',  # the library default is unigram
		num_threads=10,
	)
	# NOTE(review): SentencePiece skips input lines longer than its
	# max_sentence_length setting (default 4192 bytes, per the library docs) --
	# confirm the corpus line lengths stay below that, or set the option.
	spm.SentencePieceTrainer.train(**trainer_options)

	from sentencepiece import SentencePieceProcessor

	# Smoke test: load the model file from the working directory and print how
	# it splits a sample string (nucleotide letters first, then a stretch that
	# also contains amino-acid letters).
	sample_sequence = "TCGACGGCACGCGACAGCAGCGAGCCCCGCGCACCCGAGCGCGAKCGFVGPMVHLKVHLEADVASSCRSAVIYLTSEEPFEGVLGLRLKEGIAITGCWPRWPDEMDERSAVWRVEPYTRHFGRVLYSFGV"
	tokenizer = SentencePieceProcessor(model_file="gene_bpe_seg.model")
	print(tokenizer.EncodeAsPieces(sample_sequence))