# llm/services/health_check.py
import psutil
import torch  # required by the GPU memory checks below
from dataclasses import dataclass
from typing import Dict


@dataclass
class HealthStatus:
    status: str
    gpu_memory: Dict[str, float]
    cpu_usage: float
    ram_usage: float
    model_status: Dict[str, str]


class HealthCheck:
    @staticmethod
    def check_gpu_memory() -> Dict[str, float]:
        """Return allocated GPU memory in GiB, keyed by device index."""
        if torch.cuda.is_available():
            return {
                f"gpu_{i}": torch.cuda.memory_allocated(i) / 1024**3
                for i in range(torch.cuda.device_count())
            }
        return {}
    @staticmethod
    def check_system_resources() -> HealthStatus:
        """Collect a HealthStatus snapshot from psutil and CUDA counters."""
        return HealthStatus(
            status="healthy",
            gpu_memory=HealthCheck.check_gpu_memory(),
            cpu_usage=psutil.cpu_percent(),
            ram_usage=psutil.virtual_memory().percent,
            # TODO: add more system resources like disk, network, etc.
            model_status={},  # to be filled by the model manager
        )
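

# Minimal usage sketch (not part of the original module): run this file directly
# to print a health snapshot. model_status is left empty here, since the source
# notes it is expected to be filled by a separate model manager.
if __name__ == "__main__":
    status = HealthCheck.check_system_resources()
    print(f"status:    {status.status}")
    print(f"cpu_usage: {status.cpu_usage:.1f}%")
    print(f"ram_usage: {status.ram_usage:.1f}%")
    for gpu, mem_gib in status.gpu_memory.items():
        print(f"{gpu}: {mem_gib:.2f} GiB allocated")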