# serve.py — from Hugging Face repo Imran1/Qwen2.5-72B-Instruct-FP8
# (author: Imran1, commit 3c5ff26 verified, 552 bytes).
import os
import subprocess
def build_vllm_command():
    """Return the argv list for launching the vLLM OpenAI-compatible server.

    Kept as a separate pure function so the command construction can be
    inspected/tested without actually spawning the server.
    """
    return [
        "vllm", "serve", "Imran1/Qwen2.5-72B-Instruct-FP8",
        "--tensor-parallel-size", "4",   # shard the 72B model across 4 GPUs
        "--dtype", "auto",
        "--api-key", "token-abc123",     # NOTE(review): hard-coded API key; move to env/secret in production
        "--max-model-len", "2000",       # cap context length to bound KV-cache memory
        "--kv-cache-dtype", "auto",
    ]


def run_vllm_inference():
    """Launch the vLLM server as a subprocess and block until it exits.

    Raises:
        subprocess.CalledProcessError: if the server exits with a
            non-zero status (e.g. startup failure).
    """
    # Enable accelerated model downloads from the Hugging Face Hub.
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
    # check=True surfaces launch failures instead of returning silently
    # (the original ignored the exit status of the subprocess).
    subprocess.run(build_vllm_command(), check=True)
if __name__ == "__main__":
run_vllm_inference()