codefuse-admin committed
Commit
cffbce0
1 Parent(s): d38f5d1

Update README.md

Files changed (1)
  1. README.md +19 -25
README.md CHANGED
@@ -31,7 +31,7 @@ After undergoing 4-bit quantization, the CodeFuse-CodeLlama-34B-4bits model can
 
 🔥🔥🔥 2023-09-26 We are pleased to announce the release of the 4-bit quantized version of CodeFuse-CodeLlama-34B. Despite the quantization process, the model still achieves a remarkable 73.8% accuracy (greedy decoding) on the HumanEval pass@1 metric.
 
-🔥🔥🔥 2023-09-11 CodeFuse-CodeLlama34B has achived 74.4% of pass@1 (greedy decoding) on HumanEval, which is SOTA results for openspurced LLMs at present.
+🔥🔥🔥 2023-09-11 CodeFuse-CodeLlama-34B achieved 74.4% pass@1 (greedy decoding) on HumanEval, which is currently the SOTA result among open-source LLMs.
 
 <br>
 
@@ -124,24 +124,22 @@ pip install -r requirements.txt
 import os
 import torch
 import time
-from modelscope import AutoTokenizer, snapshot_download
-from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-def load_model_tokenizer(model_path):
+def load_model_tokenizer(model_name_or_local_path):
     """
-    Load model and tokenizer based on the given model name or local path of downloaded model.
+    Load the model and tokenizer based on the given model name or the local path of the downloaded model.
     """
-    tokenizer = AutoTokenizer.from_pretrained(model_path,
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                               trust_remote_code=True,
                                               use_fast=False,
                                               legacy=False)
     tokenizer.padding_side = "left"
-    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
-    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
 
-    model = AutoGPTQForCausalLM.from_quantized(model_path,
+    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                                inject_fused_attention=False,
                                                inject_fused_mlp=False,
                                                use_cuda_fp16=True,
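
Note: the hunk above drops the explicit `pad_token_id`/`eos_token_id` overrides, while the `generate` call further down still passes `pad_token_id=tokenizer.pad_token_id`. If the loaded tokenizer happened to have no pad token configured, that value would be `None`. A minimal defensive sketch, not part of this commit and only an assumption about the tokenizer's configuration, would be to fall back to the EOS token right after the tokenizer is created inside `load_model_tokenizer`:

```python
# Hypothetical fallback, not in the commit: reuse EOS as PAD only when PAD is unset.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
```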
@@ -153,7 +151,7 @@ def load_model_tokenizer(model_path):
 
 def inference(model, tokenizer, prompt):
     """
-    Uset the given model and tokenizer to generate an answer for the speicifed prompt.
+    Use the given model and tokenizer to generate an answer for the specified prompt.
     """
     st = time.time()
     prompt = prompt if prompt.endswith('\n') else f'{prompt}\n'
@@ -181,11 +179,10 @@ def inference(model, tokenizer, prompt):
 
 
 if __name__ == "__main__":
-    model_dir = snapshot_download('codefuse-ai/CodeFuse-CodeLlama-34B-4bits', revision='v1.0.0')
-
+    model_name_or_local_path = '<model name (i.e. codefuse-ai/CodeFuse-CodeLlama-34B-4bits) or local path of the downloaded model>'
     prompt = 'Please write a QuickSort program in Python'
 
-    model, tokenizer = load_model_tokenizer(model_dir)
+    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
     inference(model, tokenizer, prompt)
 ```
 
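The updated example replaces modelscope's `snapshot_download` with a user-supplied model name or local path. If you prefer to download the weights ahead of time and pass a local directory, one possible sketch is shown below. It is not part of this commit and assumes the model is available on the Hugging Face Hub under `codefuse-ai/CodeFuse-CodeLlama-34B-4bits` and that `huggingface_hub` is installed:

```python
# Hypothetical pre-download step; the repo id is an assumption, not taken from the commit.
from huggingface_hub import snapshot_download

model_name_or_local_path = snapshot_download(
    repo_id="codefuse-ai/CodeFuse-CodeLlama-34B-4bits"
)
print(model_name_or_local_path)  # pass this directory to load_model_tokenizer
```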
 
@@ -319,29 +316,27 @@ pip install -r requirements.txt
 import os
 import torch
 import time
-from modelscope import AutoTokenizer, snapshot_download
-from auto_gptq import AutoGPTQForCausalLM
+from transformers import AutoTokenizer
+from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
 
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
-def load_model_tokenizer(model_path):
+def load_model_tokenizer(model_name_or_local_path):
     """
     Load the model and tokenizer based on the given model name or the local path of the downloaded model.
     """
-    tokenizer = AutoTokenizer.from_pretrained(model_path,
+    tokenizer = AutoTokenizer.from_pretrained(model_name_or_local_path,
                                               trust_remote_code=True,
                                               use_fast=False,
                                               legacy=False)
     tokenizer.padding_side = "left"
-    tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<unk>")
-    tokenizer.eos_token_id = tokenizer.convert_tokens_to_ids("</s>")
 
-    model = AutoGPTQForCausalLM.from_quantized(model_path,
+    model = AutoGPTQForCausalLM.from_quantized(model_name_or_local_path,
                                                inject_fused_attention=False,
                                                inject_fused_mlp=False,
                                                use_cuda_fp16=True,
                                                disable_exllama=False,
-                                               device_map='auto'   # 支持多卡
+                                               device_map='auto'   # supports multiple GPUs
                                                )
     return model, tokenizer
 
@@ -366,7 +361,7 @@ def inference(model, tokenizer, prompt):
                                    do_sample=True,
                                    max_new_tokens=512,
                                    eos_token_id=tokenizer.eos_token_id,
-                                   pad_token_id=tokenizer.pad_token_id
+                                   pad_token_id=tokenizer.pad_token_id
     )
     print(f'generated tokens num is {len(generated_ids[0][input_ids.size(1):])}')
     outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
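
The accuracy figures quoted at the top of the README are measured with greedy decoding, while this example generates with `do_sample=True`. A small deterministic variant of the call above is sketched here; it is not part of this commit and assumes `model`, `tokenizer`, and `input_ids` are prepared exactly as in the surrounding example:

```python
# Hypothetical greedy-decoding variant of the generate call shown in the hunk above.
generated_ids = model.generate(
    inputs=input_ids,
    do_sample=False,              # greedy decoding, matching the reported pass@1 setting
    max_new_tokens=512,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
```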
@@ -376,11 +371,10 @@ def inference(model, tokenizer, prompt):
 
 
 if __name__ == "__main__":
-    model_dir = snapshot_download('codefuse-ai/CodeFuse-CodeLlama-34B-4bits', revision='v1.0.0')
-
+    model_name_or_local_path = '<模型名字 (codefuse-ai/CodeFuse-CodeLlama-34B-4bits)或者提前下载到本地的模型路径>'
     prompt = '请用Python实现一个快速排序算法'
 
-    model, tokenizer = load_model_tokenizer(model_dir)
+    model, tokenizer = load_model_tokenizer(model_name_or_local_path)
     inference(model, tokenizer, prompt)
 ```
 