Fix documentation for loading the model, since the fused attention module doesn't work here either.
README.md CHANGED
@@ -122,6 +122,7 @@ tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
 
 model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
         model_basename=model_basename,
+        inject_fused_attention=False, # Required for TheBloke/FreeWilly2-GPTQ model at this time.
         use_safetensors=True,
         trust_remote_code=False,
         device="cuda:0",
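For reference, a minimal sketch of the full loading snippet as it reads after this change. The model_basename value and the use_triton/quantize_config arguments after device= are assumptions added only to make the example self-contained; the diff truncates the call before them, so check the repo for the actual values.

# Sketch of the corrected README snippet; values marked "assumed" are not from this diff.
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

model_name_or_path = "TheBloke/FreeWilly2-GPTQ"
model_basename = "gptq_model-4bit--1g"  # assumed basename; use the actual file basename from the repo

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)

model = AutoGPTQForCausalLM.from_quantized(model_name_or_path,
        model_basename=model_basename,
        inject_fused_attention=False,  # Required for TheBloke/FreeWilly2-GPTQ model at this time.
        use_safetensors=True,
        trust_remote_code=False,
        device="cuda:0",
        use_triton=False,     # assumed; the diff ends before this argument
        quantize_config=None)  # assumed; the diff ends before this argument

Disabling inject_fused_attention makes AutoGPTQ skip replacing the model's attention layers with its fused implementation, which is what fails for this model; loading is otherwise unchanged.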