Chris4K committed on
Commit
02c1ae0
·
verified ·
1 Parent(s): 03f35b0

Create health_check.py

Browse files
Files changed (1) hide show
  1. services/health_check.py +33 -0
services/health_check.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # health_check.py
2
+ import psutil
3
+ from dataclasses import dataclass
4
+ from typing import Dict, Any
5
+
6
@dataclass
class HealthStatus:
    """Snapshot of service health as assembled by a health check.

    Fields are positional in this order: ``status``, ``gpu_memory``,
    ``cpu_usage``, ``ram_usage``, ``model_status``.
    """

    # Overall health label, e.g. "healthy".
    status: str
    # Per-device GPU memory figures, keyed by a device label.
    gpu_memory: Dict[str, float]
    # CPU utilisation figure for the host.
    cpu_usage: float
    # RAM utilisation figure for the host.
    ram_usage: float
    # Per-model state strings, keyed by model name.
    model_status: Dict[str, str]
13
+
14
class HealthCheck:
    """Collect point-in-time resource metrics for the host process."""

    @staticmethod
    def check_gpu_memory() -> Dict[str, float]:
        """Return the memory PyTorch has allocated on each CUDA device.

        Returns:
            A mapping from ``"gpu_<index>"`` to the value of
            ``torch.cuda.memory_allocated(index)`` divided by 1024**3
            (bytes converted to GiB). The mapping is empty when PyTorch
            is not installed or CUDA is not available.
        """
        # torch is not imported at module level, so referencing it directly
        # raised NameError. Import it here and treat a missing installation
        # the same as "no CUDA devices".
        try:
            import torch
        except ImportError:
            return {}

        if not torch.cuda.is_available():
            return {}

        bytes_per_gib = 1024 ** 3
        return {
            f"gpu_{index}": torch.cuda.memory_allocated(index) / bytes_per_gib
            for index in range(torch.cuda.device_count())
        }

    @staticmethod
    def check_system_resources() -> "HealthStatus":
        """Build a ``HealthStatus`` snapshot of GPU, CPU and RAM usage.

        ``status`` is always set to ``"healthy"``; no thresholds are
        evaluated here. ``model_status`` is returned empty.

        Returns:
            A ``HealthStatus`` populated from ``check_gpu_memory()``,
            ``psutil.cpu_percent()`` and ``psutil.virtual_memory().percent``.
        """
        # TODO: add more system resources like disk, network, etc.
        return HealthStatus(
            status="healthy",
            gpu_memory=HealthCheck.check_gpu_memory(),
            # NOTE(review): psutil documents that the first non-blocking
            # cpu_percent() call in a process returns a meaningless 0.0;
            # confirm callers tolerate that or prime the counter at startup.
            cpu_usage=psutil.cpu_percent(),
            ram_usage=psutil.virtual_memory().percent,
            model_status={},  # To be filled by the model manager
        )