Imran1 committed on
Commit 3c5ff26
1 Parent(s): a3fcb16

Create serve.py

Files changed (1)
  1. code/serve.py +22 -0
code/serve.py ADDED
@@ -0,0 +1,22 @@
+import os
+import subprocess
+
+def run_vllm_inference():
+    # Enable hf_transfer for faster model downloads from the Hugging Face Hub
+    os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
+
+    # vLLM serve command: launch an OpenAI-compatible server with tensor parallelism across 4 GPUs
+    command = [
+        "vllm", "serve", "Imran1/Qwen2.5-72B-Instruct-FP8",
+        "--tensor-parallel-size", "4",
+        "--dtype", "auto",
+        "--api-key", "token-abc123",
+        "--max-model-len", "2000",
+        "--kv-cache-dtype", "auto"
+    ]
+
+    # Launch the server as a blocking subprocess
+    subprocess.run(command)
+
+if __name__ == "__main__":
+    run_vllm_inference()
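
Once this script is running, vLLM exposes an OpenAI-compatible HTTP API (on port 8000 by default; this port is an assumption, since no --port flag is passed above). A minimal client sketch, assuming the openai Python package is installed and the server is reachable on localhost; the model name and API key are taken from the serve command above:

from openai import OpenAI

# Point the OpenAI client at the local vLLM server. Port 8000 is vLLM's default
# and an assumption here; adjust base_url if `vllm serve` was started with --port.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="token-abc123")

response = client.chat.completions.create(
    model="Imran1/Qwen2.5-72B-Instruct-FP8",
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=64,  # stay well within the --max-model-len 2000 context limit
)
print(response.choices[0].message.content)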