alvarobartt (HF staff) committed
Commit 283c881
1 Parent(s): 6e5043b

Update README.md

Files changed (1)
  1. README.md +14 -11
README.md CHANGED
@@ -60,13 +60,6 @@ prompt = [
  ]

  tokenizer = AutoTokenizer.from_pretrained(model_id)
- tokenizer.pad_token_id = tokenizer.eos_token_id
- tokenizer.padding_side = "left"
-
- terminators = [
-     tokenizer.eos_token_id,
-     tokenizer.convert_tokens_to_ids("<|eot_id|>"),
- ]

  inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()

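The settings and the `terminators` list removed here are presumably redundant for the Llama 3.1 Instruct checkpoints: padding only matters for batched inputs, and the repository's `generation_config.json` should already list `<|eot_id|>` among its EOS token ids, which is what `model.generate` falls back to. A minimal sketch to check that assumption (the `model_id` below is itself an assumption, since its definition sits outside the hunks shown):

```python
from transformers import AutoTokenizer, GenerationConfig

# Assumed repository id; the README defines `model_id` outside this diff.
model_id = "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4"

tokenizer = AutoTokenizer.from_pretrained(model_id)
generation_config = GenerationConfig.from_pretrained(model_id)

# If <|eot_id|> already appears among the configured EOS ids, the manual
# `terminators` list (and passing `eos_token_id=` to `generate`) is unnecessary.
eot_id = tokenizer.convert_tokens_to_ids("<|eot_id|>")
print(eot_id, generation_config.eos_token_id)
```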
@@ -77,12 +70,24 @@ model = AutoModelForCausalLM.from_pretrained(
      device_map="auto",
  )

- outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256, eos_token_id=terminators)
+ outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
  print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
  ```

  ### AutoAWQ

+ In order to run the inference with Llama 3.1 405B Instruct AWQ in INT4, both `torch` and `autoawq` need to be installed as:
+
+ ```bash
+ pip install "torch>=2.2.0,<2.3.0" autoawq --upgrade
+ ```
+
+ Then, the latest version of `transformers` need to be installed, being 4.43.0 or higher, as:
+
+ ```bash
+ pip install "transformers[accelerate]>=4.43.0" --upgrade
+ ```
+
  Alternatively, one may want to run that via `AutoAWQ` even though it's built on top of 🤗 `transformers`, which is the recommended approach instead as described above.

  ```python
@@ -97,8 +102,6 @@ prompt = [
  ]

  tokenizer = AutoTokenizer.from_pretrained(model_id)
- tokenizer.pad_token_id = tokenizer.eos_token_id
- tokenizer.padding_side = "left"

  inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()

@@ -110,7 +113,7 @@ model = AutoAWQForCausalLM.from_pretrained(
      fuse_layers=True,
  )

- outputs = model.generate(**inputs, do_sample=True, max_new_tokens=256)
+ outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
  print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
  ```
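Putting the first two hunks together, the 🤗 `transformers` example in the updated README reads roughly as the sketch below. The `prompt`, `model_id`, and the `from_pretrained` keyword arguments other than `device_map="auto"` are not visible in the diff context, so they are assumptions here.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Assumed definitions: the README sets these above the first hunk.
model_id = "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4"
prompt = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's Deep Learning?"},
]

tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()

# Only `device_map="auto"` appears in the diff context; the other keyword
# arguments are assumptions.
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
)

# `apply_chat_template(..., return_tensors="pt")` returns a tensor of input ids,
# not a dict, which is why the commit drops the `**` unpacking.
outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```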
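Similarly, after the last two hunks the `AutoAWQ` example ends up roughly as follows; only the `AutoAWQForCausalLM.from_pretrained(` call and `fuse_layers=True` are visible in the diff context, so the remaining arguments and definitions are assumptions.

```python
import torch
from awq import AutoAWQForCausalLM
from transformers import AutoTokenizer

# Assumed definitions, as in the sketch above.
model_id = "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4"
prompt = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's Deep Learning?"},
]

tokenizer = AutoTokenizer.from_pretrained(model_id)

inputs = tokenizer.apply_chat_template(prompt, tokenize=True, add_generation_prompt=True, return_tensors="pt").cuda()

# Only `fuse_layers=True` is shown in the diff; the other keyword arguments
# are assumptions.
model = AutoAWQForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto",
    fuse_layers=True,
)

outputs = model.generate(inputs, do_sample=True, max_new_tokens=256)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```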