woodchen7 committed on
Commit
113da9b
1 Parent(s): 3ed5f0b

Upload test4consistent.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. test4consistent.py +19 -0
test4consistent.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
# Test tokenizer encode & decode consistency.
#
# Each line of the test file is stripped of surrounding whitespace, encoded to
# token ids and decoded back. The script stops with an AssertionError on the
# first line whose decoded text differs from the input; otherwise it prints the
# summed character count and the summed token count over all lines.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('/apdcephfs/share_1502809/shaneshu/tokenizer_exp/other_tokenizer_vocab/hy', local_files_only=True, trust_remote_code=True)

# Read through a context manager so the file handle is closed deterministically,
# and pin the encoding so decoding does not depend on the process locale.
with open('/apdcephfs/share_1502809/shaneshu/tokenizer_exp/data/test.txt', 'r', encoding='utf-8') as test_file:
    test_data = [line.strip() for line in test_file]

num_origi_len = 0  # running total of characters over all test lines
num_token_len = 0  # running total of token ids over all test lines

for text in test_data:
    # NOTE(review): encode()/decode() run with their default arguments; if this
    # tokenizer inserts special tokens by default they are counted below and
    # show up in the decoded text -- confirm that is the intended comparison.
    token_ids = tokenizer.encode(text)
    num_origi_len += len(text)
    num_token_len += len(token_ids)
    decoded = tokenizer.decode(token_ids)
    assert decoded == text, f"encode & decode not consistent: {text} vs {decoded}"

print(f" original length: {num_origi_len}")
print(f" token length: {num_token_len}")