import os
import subprocess


def run_vllm_inference():
    # Set the necessary environment variables (enable faster Hugging Face downloads)
    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

    # vLLM serve command: shard the FP8 model across 4 GPUs and cap the
    # context window at 2000 tokens
    command = [
        "vllm", "serve", "Imran1/Qwen2.5-72B-Instruct-FP8",
        "--tensor-parallel-size", "4",
        "--dtype", "auto",
        "--api-key", "token-abc123",
        "--max-model-len", "2000",
        "--kv-cache-dtype", "auto",
    ]

    # Run the command as a subprocess (blocks until the server exits)
    subprocess.run(command)


if __name__ == "__main__":
    run_vllm_inference()
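
# --- Usage sketch (not part of the original script) ---
# Once the server launched above is running, it exposes an OpenAI-compatible
# API, by default at http://localhost:8000/v1. The helper below is a minimal
# sketch of querying it with the `openai` Python package; the port, the
# function name `query_vllm_server`, and the prompt are illustrative
# assumptions. Run it from a separate process/terminal while the server is up.

def query_vllm_server():
    from openai import OpenAI  # assumes `pip install openai`

    client = OpenAI(
        base_url="http://localhost:8000/v1",  # vLLM's default serving address (assumed)
        api_key="token-abc123",               # must match the --api-key passed to `vllm serve`
    )
    response = client.chat.completions.create(
        model="Imran1/Qwen2.5-72B-Instruct-FP8",
        messages=[{"role": "user", "content": "Say hello in one sentence."}],
        max_tokens=64,
    )
    print(response.choices[0].message.content)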