colab t4

#3
by sdyy - opened

It runs with:
from transformers import pipeline

messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="ModelCloud/gemma-2-9b-it-gptq-4bit")
pipe(messages)

It does not run with:

import os

# Gemma-2 uses the Flashinfer backend for models with logits_soft_cap. Otherwise, the output might be wrong.

os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASHINFER'

from transformers import AutoTokenizer
from gptqmodel import BACKEND, GPTQModel

model_name = "ModelCloud/gemma-2-27b-it-gptq-4bit"

prompt = [{"role": "user", "content": "I am in Shanghai, preparing to visit the natural history museum. Can you tell me the best way to"}]

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = GPTQModel.from_quantized(
model_name,
backend=BACKEND.VLLM,
)

inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
outputs = model.generate(prompts=inputs, temperature=0.95, max_length=128)
print(outputs[0].outputs[0].text)

import os

# Gemma-2 uses the Flashinfer backend for models with logits_soft_cap. Otherwise, the output might be wrong.

os.environ['VLLM_ATTENTION_BACKEND'] = 'FLASHINFER'

from transformers import AutoTokenizer
from gptqmodel import BACKEND, GPTQModel

model_name = "ModelCloud/gemma-2-9b-it-gptq-4bit"

prompt = [{"role": "user", "content": "I am in Shanghai, preparing to visit the natural history museum. Can you tell me the best way to"}]

tokenizer = AutoTokenizer.from_pretrained(model_name)

model = GPTQModel.from_quantized(
model_name,
backend=BACKEND.VLLM,
)

inputs = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True)
outputs = model.generate(prompts=inputs, temperature=0.95, max_length=128)
print(outputs[0].outputs[0].text)

/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co./settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/huggingface_hub/file_download.py:797: FutureWarning: resume_download is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use force_download=True.
warnings.warn(
INFO - Ignoring unknown parameter in the quantization configuration: model_name_or_path.
INFO - Ignoring unknown parameter in the quantization configuration: model_file_base_name.
WARNING 12-10 23:57:32 config.py:200] Gemma 2 uses sliding window attention for every odd layer, which is currently not supported by vLLM. Disabling sliding window and capping the max length to the sliding window size (4096).
WARNING 12-10 23:57:32 config.py:319] gptq quantization is not fully optimized yet. The speed can be slower than non-quantized models.
INFO 12-10 23:57:32 llm_engine.py:234] Initializing an LLM engine (v0.6.3.dev28+g33f460b1.d20240927) with config: model='ModelCloud/gemma-2-9b-it-gptq-4bit', speculative_config=None, tokenizer='ModelCloud/gemma-2-9b-it-gptq-4bit', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=gptq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=ModelCloud/gemma-2-9b-it-gptq-4bit, use_v2_block_manager=False, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=False, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, mm_processor_kwargs=None)
INFO 12-10 23:57:35 selector.py:142] Using Flashinfer backend.

ValueError Traceback (most recent call last)
in <cell line: 14>()
12 tokenizer = AutoTokenizer.from_pretrained(model_name)
13
---> 14 model = GPTQModel.from_quantized(
15 model_name,
16 backend=BACKEND.VLLM,

9 frames
/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py in _check_if_gpu_supports_dtype(torch_dtype)
466 compute_str = f"has compute capability {version_str}"
467
--> 468 raise ValueError(
469 "Bfloat16 is only supported on GPUs with compute capability "
470 f"of at least 8.0. Your {gpu_name} GPU {compute_str}. "

ValueError: Bfloat16 is only supported on GPUs with compute capability of at least 8.0. Your Tesla T4 GPU has compute capability 7.5. You can use float16 instead by explicitly setting thedtype flag in CLI, for example: --dtype=half.

from transformers import pipeline

messages = [
{"role": "user", "content": "Who are you?"},
]
pipe = pipeline("text-generation", model="ModelCloud/gemma-2-9b-it-gptq-4bit")
pipe(messages)

/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:411: FutureWarning: torch.cuda.amp.custom_fwd(args...) is deprecated. Please use torch.amp.custom_fwd(args..., device_type='cuda') instead.
def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:419: FutureWarning: torch.cuda.amp.custom_bwd(args...) is deprecated. Please use torch.amp.custom_bwd(args..., device_type='cuda') instead.
def backward(ctx, grad_output):
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:461: FutureWarning: torch.cuda.amp.custom_fwd(args...) is deprecated. Please use torch.amp.custom_fwd(args..., device_type='cuda') instead.
@custom_fwd(cast_inputs=torch.float16)
WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda:CUDA extension not installed.
WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda_old:CUDA extension not installed.
low_cpu_mem_usage was None, now default to True since model is quantized.
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:5055: FutureWarning: _is_quantized_training_enabled is going to be deprecated in transformers 4.39.0. Please use model.hf_quantizer.is_trainable instead
warnings.warn(
loss_type=None was set in the config but it is unrecognised.Using the default loss: ForCausalLMLoss.
WARNING:optimum.gptq.quantizer:Found modules on cpu/disk. Using Exllama/Exllamav2 backend requires all the modules to be on GPU. Setting disable_exllama=True
Device set to use cuda:0
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
[{'generated_text': [{'role': 'user', 'content': 'Who are you?'},
{'role': 'assistant',
'content': 'I am Gemma, an open-weights AI assistant. I am a large language model trained by Google'}]}]

It runs with:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Define GPTQ configuration

gptq_config = GPTQConfig(bits=2, use_exllama=True, use_cuda_fp16=True)

# Load the quantized model

model = AutoModelForCausalLM.from_pretrained(
"ModelCloud/gemma-2-9b-gptq-4bit",
device_map="auto",
quantization_config=gptq_config
)

# Load the tokenizer (this line was missing)

tokenizer = AutoTokenizer.from_pretrained("ModelCloud/gemma-2-9b-gptq-4bit")

# Generate and print the output

print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))

[ ]
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Define GPTQ configuration

gptq_config = GPTQConfig(bits=2, use_exllama=True, use_cuda_fp16=True)

# Load the quantized model

model = AutoModelForCausalLM.from_pretrained(
"ModelCloud/gemma-2-9b-gptq-4bit",
device_map="auto",
quantization_config=gptq_config
)

# Load the tokenizer (this line was missing)

tokenizer = AutoTokenizer.from_pretrained("ModelCloud/gemma-2-9b-gptq-4bit")

# Generate and print the output

print(tokenizer.decode(model.generate(**tokenizer("gptqmodel is", return_tensors="pt").to(model.device))[0]))
/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_auth.py:94: UserWarning:
The secret HF_TOKEN does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co./settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/transformers/quantizers/auto.py:186: UserWarning: You passed quantization_config or equivalent parameters to from_pretrained but the model you're loading already has a quantization_config attribute. The quantization_config from the model will be used.However, loading attributes (e.g. ['use_cuda_fp16', 'use_exllama', 'max_input_length', 'exllama_config', 'disable_exllama']) will be overwritten with the one you passed to from_pretrained. The rest will be ignored.
warnings.warn(warning_msg)
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:411: FutureWarning: torch.cuda.amp.custom_fwd(args...) is deprecated. Please use torch.amp.custom_fwd(args..., device_type='cuda') instead.
def forward(ctx, input, qweight, scales, qzeros, g_idx, bits, maxq):
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:419: FutureWarning: torch.cuda.amp.custom_bwd(args...) is deprecated. Please use torch.amp.custom_bwd(args..., device_type='cuda') instead.
def backward(ctx, grad_output):
/usr/local/lib/python3.10/dist-packages/auto_gptq/nn_modules/triton_utils/kernels.py:461: FutureWarning: torch.cuda.amp.custom_fwd(args...) is deprecated. Please use torch.amp.custom_fwd(args..., device_type='cuda') instead.
@custom_fwd(cast_inputs=torch.float16)
WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda:CUDA extension not installed.
WARNING:auto_gptq.nn_modules.qlinear.qlinear_cuda_old:CUDA extension not installed.
/usr/local/lib/python3.10/dist-packages/transformers/modeling_utils.py:5055: FutureWarning: _is_quantized_training_enabled is going to be deprecated in transformers 4.39.0. Please use model.hf_quantizer.is_trainable instead
warnings.warn(
loss_type=None was set in the config but it is unrecognised.Using the default loss: ForCausalLMLoss.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
gptqmodel is a Python library. gptqmodel has no vulnerabilities, it has a Permissive License and it

It also runs with:

import torch
from transformers import AutoModelForCausalLM, GPTQConfig, AutoTokenizer
gptq_config = GPTQConfig(bits=2, use_exllama=True, use_cuda_fp16=True)
model = AutoModelForCausalLM.from_pretrained("ModelCloud/gemma-2-9b-gptq-4bit", device_map="auto", quantization_config=gptq_config)
tokenizer = AutoTokenizer.from_pretrained("ModelCloud/gemma-2-9b-gptq-4bit")
print(tokenizer.decode(model.generate(**tokenizer("ai is", return_tensors="pt").to(model.device))[0]))

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
ai is a very good tool for the future. It can help us to do many things. For example,

!pip install flashinfer -i https://flashinfer.ai/whl/cu121/torch2.3

Sign up or log in to comment