smcleod committed on
Commit
f14551e
·
verified ·
1 Parent(s): ea21fc1

Update convert_tokenizer.py

Browse files
Files changed (1) hide show
  1. convert_tokenizer.py +29 -5
convert_tokenizer.py CHANGED
@@ -1,4 +1,4 @@
1
- from transformers import PreTrainedTokenizerFast, AutoTokenizer
2
  import json
3
  import os
4
  import shutil
@@ -34,8 +34,7 @@ def convert_phi_tokenizer(input_dir, output_dir):
34
  'tokenizer.json',
35
  'tokenizer_config.json',
36
  'special_tokens_map.json',
37
- 'vocab.json',
38
- 'added_tokens.json'
39
  ]
40
 
41
  # Files to copy directly (no JSON parsing)
@@ -77,7 +76,7 @@ def convert_phi_tokenizer(input_dir, output_dir):
77
  'add_prefix_space': False,
78
  'clean_up_tokenization_spaces': False,
79
  'model_max_length': 16384,
80
- 'tokenizer_class': 'GPT2Tokenizer',
81
  'bos_token': '<|endoftext|>',
82
  'eos_token': '<|endoftext|>',
83
  'pad_token': '<|endoftext|>'
@@ -89,6 +88,31 @@ def convert_phi_tokenizer(input_dir, output_dir):
89
  json.dump(config, f, indent=2)
90
  print("Successfully updated config")
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  print("\nAttempting to test tokenizer...")
93
  try:
94
  tokenizer = AutoTokenizer.from_pretrained(output_dir)
@@ -99,7 +123,7 @@ def convert_phi_tokenizer(input_dir, output_dir):
99
  print(f"Test text: {test_text}")
100
  print(f"Encoded: {tokens}")
101
  print(f"Decoded: {decoded}")
102
-
103
  # check if they're the same
104
  if test_text != decoded:
105
  print("Decoded text does not match original text!")
 
1
+ from transformers import AutoTokenizer
2
  import json
3
  import os
4
  import shutil
 
34
  'tokenizer.json',
35
  'tokenizer_config.json',
36
  'special_tokens_map.json',
37
+ 'added_tokens.json' # Moved added_tokens.json here
 
38
  ]
39
 
40
  # Files to copy directly (no JSON parsing)
 
76
  'add_prefix_space': False,
77
  'clean_up_tokenization_spaces': False,
78
  'model_max_length': 16384,
79
+ 'tokenizer_class': 'GPT2Tokenizer', # Changed to GPT2Tokenizer
80
  'bos_token': '<|endoftext|>',
81
  'eos_token': '<|endoftext|>',
82
  'pad_token': '<|endoftext|>'
 
88
  json.dump(config, f, indent=2)
89
  print("Successfully updated config")
90
 
91
+ # Construct the vocabulary with added tokens
92
+ print("\nConstructing vocabulary...")
93
+ tokenizer_path = os.path.join(output_dir, "tokenizer.json")
94
+ tokenizer_data = safe_read_json(tokenizer_path)
95
+ if tokenizer_data is None:
96
+ print("Error: Unable to read tokenizer.json")
97
+ return
98
+
99
+ vocab = tokenizer_data["model"]["vocab"]
100
+ added_tokens = tokenizer_data.get("added_tokens", [])
101
+
102
+ for token_data in added_tokens:
103
+ content = token_data["content"]
104
+ if content not in vocab:
105
+ vocab[content] = token_data["id"]
106
+
107
+ vocab_size = len(vocab)
108
+ print(f"Vocabulary size: {vocab_size}")
109
+
110
+ # Save the vocabulary as vocab.json
111
+ vocab_output_path = os.path.join(output_dir, "vocab.json")
112
+ with open(vocab_output_path, 'w', encoding='utf-8') as f:
113
+ json.dump(vocab, f, indent=2)
114
+ print(f"Successfully saved vocabulary to {vocab_output_path}")
115
+
116
  print("\nAttempting to test tokenizer...")
117
  try:
118
  tokenizer = AutoTokenizer.from_pretrained(output_dir)
 
123
  print(f"Test text: {test_text}")
124
  print(f"Encoded: {tokens}")
125
  print(f"Decoded: {decoded}")
126
+
127
  # check if they're the same
128
  if test_text != decoded:
129
  print("Decoded text does not match original text!")