xu-song committed on
Commit
a37f943
·
1 Parent(s): 0415b36
Files changed (2) hide show
  1. examples.py +2 -2
  2. util.py +1 -1
examples.py CHANGED
@@ -1,6 +1,6 @@
1
  examples = {
2
  "en": [
3
- ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm_6b"], # chatglm 有blank_n,
4
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
5
  ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
6
  ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
@@ -8,7 +8,7 @@ examples = {
8
  ]
9
  ,
10
  "zh": [
11
- ["空格测试: 2个空格 8个空格", "llama", "chatglm_6b"], # chatglm 有blank_n,
12
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
13
  ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
14
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
 
1
  examples = {
2
  "en": [
3
+ ["spaces: 2spaces 8spaces\t1tab\t\t2tab\n1newline", "llama", "chatglm2_6b"], # chatglm 有blank_n,
4
  # !?。"#$%&'()*+,-/:;<=>@[\]^_`{|}~⦅⦆「」、、〃》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.
5
  ["punctuations: ,.:/?+=\",。!?;【】〔〕〖〗", "baichuan", "llama"],
6
  ["symbols: 🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan", "llama"],
 
8
  ]
9
  ,
10
  "zh": [
11
+ ["空格测试: 2个空格 8个空格", "llama", "chatglm2_6b"], # chatglm 有blank_n,
12
  ["标点测试:,。!?;", "baichuan_7b", "llama"],
13
  ["符号测试:🦙❤❥웃유♋☮✊☏☢☚✔☑♚▢♪✈✞÷↑↓▤▥⊙■□▣▽¿─│♥❣▬▫☿Ⓐ ✋✉☣☤", "baichuan_7b", "llama"],
14
  ["数字测试:(10086 + 98) = 100184", "baichuan_7b", "llama"],
util.py CHANGED
@@ -31,7 +31,7 @@ def tokenize(text, tokenizer_type, color_num=5):
31
  token_str = token.decode("utf-8")
32
  except:
33
  token_str = token.decode("utf-8", errors="ignore")
34
- logger.error("decode_error: " + json.dumps(
35
  {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
36
  ensure_ascii=False))
37
 
 
31
  token_str = token.decode("utf-8")
32
  except:
33
  token_str = token.decode("utf-8", errors="ignore")
34
+ logger.error("decode_error: " + json.dumps( # gpt_35_turbo 经常有token会decode error,这里用来记录一下
35
  {"tokenizer_type": tokenizer_type, "token": str(token), "token_str": token_str},
36
  ensure_ascii=False))
37