vLLM call for multiple GPUs, just FYI.
from vllm import LLM
from vllm.sampling_params import SamplingParams
model_name = "mistralai/Mistral-Small-Instruct-2409"
sampling_params = SamplingParams(max_tokens=8192)
# Note that running Mistral-Small on a single GPU requires at least 44 GB of GPU RAM.
# To split the GPU requirement over multiple devices, add e.g. tensor_parallel_size=2
llm = LLM(model=model_name, tokenizer_mode="mistral", config_format="mistral", load_format="mistral", tensor_parallel_size=2)
prompt = "How often does the letter r occur in Mistral?"
messages = [
    {"role": "user", "content": prompt},
]
outputs = llm.chat(messages, sampling_params=sampling_params)
print(outputs[0].outputs[0].text)
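If you are not sure how many GPUs a machine has, a minimal sketch (assuming torch is available, which it is as a vLLM dependency) can pick tensor_parallel_size from the visible device count:

import torch
from vllm import LLM

# Count visible CUDA devices and shard the model across all of them
# (the GPU count must evenly divide the model's attention heads).
num_gpus = torch.cuda.device_count()

llm = LLM(
    model="mistralai/Mistral-Small-Instruct-2409",
    tokenizer_mode="mistral",
    config_format="mistral",
    load_format="mistral",
    tensor_parallel_size=num_gpus,
)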
Also, the equivalent server command: vllm serve mistralai/Mistral-Small-Instruct-2409 --tokenizer_mode mistral --config_format mistral --load_format mistral --tensor-parallel-size 2
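Once the server is up, you can query it with any OpenAI-compatible client. A minimal sketch, assuming the server is running on the default host/port (localhost:8000) and no API key was configured:

from openai import OpenAI

# vLLM exposes an OpenAI-compatible endpoint; the api_key value is ignored
# unless the server was started with --api-key.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

response = client.chat.completions.create(
    model="mistralai/Mistral-Small-Instruct-2409",
    messages=[{"role": "user", "content": "How often does the letter r occur in Mistral?"}],
    max_tokens=8192,
)
print(response.choices[0].message.content)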