Update README.md
Browse files
README.md
CHANGED
@@ -45,15 +45,22 @@ To run the inference on top of Llama 3.1 405B Instruct AWQ in INT4 precision, th
|
|
45 |
|
46 |
```python
|
47 |
import torch
|
48 |
-
from transformers import AutoModelForCausalLM, AutoTokenizer
|
49 |
|
50 |
model_id = "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4"
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
52 |
model = AutoModelForCausalLM.from_pretrained(
|
53 |
model_id,
|
54 |
torch_dtype=torch.float16,
|
55 |
low_cpu_mem_usage=True,
|
56 |
device_map="auto",
|
|
|
57 |
)
|
58 |
|
59 |
prompt = [
|
|
|
45 |
|
46 |
```python
|
47 |
import torch
|
48 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer, AwqConfig
|
49 |
|
50 |
model_id = "hugging-quants/Meta-Llama-3.1-405B-Instruct-AWQ-INT4"
|
51 |
+
quantization_config = AwqConfig(
|
52 |
+
bits=4,
|
53 |
+
fuse_max_seq_len=512, # Note: Update this as per your use-case
|
54 |
+
do_fuse=True,
|
55 |
+
)
|
56 |
+
|
57 |
tokenizer = AutoTokenizer.from_pretrained(model_id)
|
58 |
model = AutoModelForCausalLM.from_pretrained(
|
59 |
model_id,
|
60 |
torch_dtype=torch.float16,
|
61 |
low_cpu_mem_usage=True,
|
62 |
device_map="auto",
|
63 |
+
quantization_config=quantization_config
|
64 |
)
|
65 |
|
66 |
prompt = [
|