how to enable low memory version
#3
opened by aarunsoman
model = transformers.AutoModelForCausalLM.from_pretrained(
    'h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3',
    trust_remote_code=True,
    load_8bit=True,
    torch_dtype=bfloat16,
)
This doesn't work.
import torch
from h2oai_pipeline import H2OTextGenerationPipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

quantization_config = None
# optional quantization: load the weights in 8-bit via bitsandbytes to reduce GPU memory
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
)

tokenizer = AutoTokenizer.from_pretrained(
    "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
    use_fast=False,
    padding_side="left",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map={"": "cuda:0"},
    quantization_config=quantization_config,
).eval()

generate_text = H2OTextGenerationPipeline(model=model, tokenizer=tokenizer)

res = generate_text(
    "Why is drinking water so healthy?",
    min_new_tokens=2,
    max_new_tokens=1024,
    do_sample=False,
    num_beams=1,
    temperature=float(0.3),
    repetition_penalty=float(1.2),
    renormalize_logits=True,
)
print(res[0]["generated_text"])
Something like this should work, but note that this model was trained in float16, so quantization will likely lead to degraded results.
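If 8-bit is still too large for your GPU, 4-bit loading is another option. This is not covered in this thread, so treat the following as an untested sketch: recent transformers/bitsandbytes versions expose 4-bit loading through the same BitsAndBytesConfig, and the float16-training caveat above applies even more strongly.

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# 4-bit NF4 quantization (bitsandbytes): weights take roughly a quarter of the
# float16 footprint, at the cost of further quality loss for a float16-trained model.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(
    "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
    use_fast=False,
    padding_side="left",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    "h2oai/h2ogpt-gm-oasst1-en-2048-falcon-7b-v3",
    trust_remote_code=True,
    device_map={"": "cuda:0"},
    quantization_config=quantization_config,
).eval()

This requires a bitsandbytes version with 4-bit support (roughly 0.39 or newer) and a correspondingly recent transformers release.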
ilu000 changed discussion status to closed