How can I do inference with this model?

#5 opened by Grambel

I followed all the steps and I now have the transformed weights. How do I use these weights to run inference with a quantized model on GPU?

I tried model = LlamaForCausalLM.from_pretrained('<path_to_weights>/oasst-sft-6-llama-30b/', load_in_8bit=True, device_map='auto'), but this gives the following error:

ValueError: 
    Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit
    the quantized model. If you want to dispatch the model on the CPU or the disk while keeping
    these modules in 32-bit, you need to set `load_in_8bit_fp32_cpu_offload=True` and pass a custom
    `device_map` to `from_pretrained`. Check
    https://huggingface.co./docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu
    for more details.

Any suggestions on how to run this on a 48 GB GPU?
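
For reference, the docs linked in the error describe passing a custom device_map together with fp32 CPU offload. A rough sketch of that pattern, which I have not verified with these weights (the device_map below is illustrative and the module names have to match the model):

from transformers import BitsAndBytesConfig, LlamaForCausalLM

# keep any CPU-offloaded modules in fp32, as the error message suggests
quant_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_enable_fp32_cpu_offload=True,
)

# illustrative split: decoder stack on GPU 0, LM head on the CPU
device_map = {
    "model": 0,
    "lm_head": "cpu",
}

model = LlamaForCausalLM.from_pretrained(
    '<path_to_weights>/oasst-sft-6-llama-30b/',
    quantization_config=quant_config,
    device_map=device_map,
)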

I faced this error with the vicuna-13b model. I read that upgrading accelerate to 0.18.0 would resolve it.

accelerate is already on 0.18.0.

Has anyone found a solution? I'm facing the same issue.

The inference code is below. I am importing train.py from the Stanford Alpaca repo.

My Env:

absl-py==1.4.0
accelerate==0.18.0
aiohttp==3.8.4
aiosignal==1.3.1
appdirs==1.4.4
async-timeout==4.0.2
attrs==23.1.0
bitsandbytes==0.38.1
certifi==2022.12.7
charset-normalizer==3.1.0
click==8.1.3
cmake==3.26.3
docker-pycreds==0.4.0
filelock==3.12.0
fire==0.5.0
frozenlist==1.3.3
fsspec==2023.4.0
gitdb==4.0.10
GitPython==3.1.31
huggingface-hub==0.14.1
idna==3.4
Jinja2==3.1.2
joblib==1.2.0
lit==16.0.2
MarkupSafe==2.1.2
mpmath==1.3.0
multidict==6.0.4
networkx==3.1
nltk==3.8.1
numpy==1.24.3
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
openai==0.27.4
packaging==23.1
pathtools==0.1.2
protobuf==4.22.3
psutil==5.9.5
PyYAML==6.0
regex==2023.3.23
requests==2.28.2
rouge-score==0.1.2
sentencepiece==0.1.98
sentry-sdk==1.21.0
setproctitle==1.3.2
six==1.16.0
smmap==5.0.0
sympy==1.11.1
termcolor==2.3.0
tokenizers==0.13.3
torch==2.0.0
tqdm==4.65.0
transformers==4.28.1
triton==2.0.0
typing_extensions==4.5.0
urllib3==1.26.15
wandb==0.15.0
yarl==1.9.1

from dataclasses import dataclass, field

import numpy as np
import torch
import transformers
from transformers import GenerationConfig

from train import (
    ModelArguments,
    smart_tokenizer_and_embedding_resize,
    DEFAULT_PAD_TOKEN,
    DEFAULT_EOS_TOKEN,
    DEFAULT_BOS_TOKEN,
    DEFAULT_UNK_TOKEN,
    PROMPT_DICT,
)


@dataclass
class InferenceArguments:
    model_max_length: int = field(
        default=512,
        metadata={"help": "Maximum sequence length. Sequences will be right padded (and possibly truncated)."},
    )
    load_in_8bit: bool = field(
        default=False,
        metadata={"help": "Load the model in 8-bit mode."},
    )
    inference_dtype: torch.dtype = field(
        default=torch.float32,
        metadata={"help": "The dtype to use for inference."},
    )


def generate_prompt(instruction, input=None):
    if input:
        return PROMPT_DICT["prompt_input"].format(instruction=instruction, input=input)
    else:
        return PROMPT_DICT["prompt_no_input"].format(instruction=instruction)


def inference():
    # parse both the Alpaca ModelArguments and the local InferenceArguments from the CLI
    parser = transformers.HfArgumentParser((ModelArguments, InferenceArguments))
    model_args, inference_args = parser.parse_args_into_dataclasses()

    model = transformers.AutoModelForCausalLM.from_pretrained(
        "./oasst-sft-6-llama-30b",
        load_in_8bit=inference_args.load_in_8bit,
        torch_dtype=inference_args.inference_dtype,
        device_map="auto",
    )
    model.cpu()
    model.eval()

    generation_config = GenerationConfig(
        temperature=0.1,
        top_p=0.75,
        num_beams=4,
    )

    tokenizer = transformers.AutoTokenizer.from_pretrained(
        "./oasst-sft-6-llama-30b",
        use_fast=False,
        model_max_length=inference_args.model_max_length,
    )

    # add a pad token if the checkpoint does not define one
    if tokenizer.pad_token is None:
        smart_tokenizer_and_embedding_resize(
            special_tokens_dict=dict(pad_token=DEFAULT_PAD_TOKEN),
            tokenizer=tokenizer,
            model=model,
        )
    tokenizer.add_special_tokens(
        {
            "eos_token": DEFAULT_EOS_TOKEN,
            "bos_token": DEFAULT_BOS_TOKEN,
            "unk_token": DEFAULT_UNK_TOKEN,
        }
    )

    ctx = ""
    for instruction in ["Whats your objective?", "Who are you?", "Write me a poem about birds?"]:
        print("Instruction:", instruction)
        inputs = tokenizer(generate_prompt(instruction, None), return_tensors="pt")
        outputs = model.generate(
            input_ids=inputs["input_ids"].cpu(),
            generation_config=generation_config,
            max_new_tokens=inference_args.model_max_length,
            return_dict_in_generate=True,
            output_scores=True,
        )
        # strip the prompt tokens from the generated sequence
        input_length = 1 if model.config.is_encoder_decoder else inputs.input_ids.shape[1]
        generated_tokens = outputs.sequences[:, input_length:]

        ctx += f"Instruction: {instruction}\n" + f"Response: {generated_tokens[0]}\n"
        print("Response:", tokenizer.decode(generated_tokens[0]))
        print()


if __name__ == "__main__":
    inference()
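
In case it matters, I launch it with the flags that HfArgumentParser builds from the two dataclasses (inference.py is just what I call the file):

python inference.py --load_in_8bit True --model_max_length 512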

You'll just have to offload to CPU or disk, because the model is too big.
This works for me:

model = transformers.LlamaForCausalLM.from_pretrained(
    PATH,
    local_files_only=True,
    torch_dtype=torch.float16,
    device_map='auto',
    low_cpu_mem_usage=True,
    offload_folder='models_hf',
)
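
Once it loads, generation works as usual. A quick sketch (the plain prompt here ignores whatever prompt template the model expects, so treat it as a smoke test):

tokenizer = transformers.AutoTokenizer.from_pretrained(PATH, local_files_only=True, use_fast=False)

inputs = tokenizer("Who are you?", return_tensors="pt")
# send the prompt to the device the first layers were dispatched to
outputs = model.generate(inputs["input_ids"].to(model.device), max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))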
