feat: add health check endpoint and related schemas with tests

This commit is contained in:
Jokob @NetAlertX
2026-02-17 23:01:49 +00:00
parent 9ac8f6fe34
commit 264cae3338
6 changed files with 451 additions and 1 deletions

View File

@@ -41,6 +41,7 @@ from .nettools_endpoint import ( # noqa: E402 [flake8 lint suppression]
from .dbquery_endpoint import read_query, write_query, update_query, delete_query # noqa: E402 [flake8 lint suppression]
from .sync_endpoint import handle_sync_post, handle_sync_get # noqa: E402 [flake8 lint suppression]
from .logs_endpoint import clean_log # noqa: E402 [flake8 lint suppression]
from .health_endpoint import get_health_status # noqa: E402 [flake8 lint suppression]
from models.user_events_queue_instance import UserEventsQueueInstance # noqa: E402 [flake8 lint suppression]
from models.event_instance import EventInstance # noqa: E402 [flake8 lint suppression]
@@ -86,6 +87,7 @@ from .openapi.schemas import ( # noqa: E402 [flake8 lint suppression]
RecentEventsResponse, LastEventsResponse,
NetworkTopologyResponse,
InternetInfoResponse, NetworkInterfacesResponse,
HealthCheckResponse,
CreateEventRequest, CreateSessionRequest,
DeleteSessionRequest, CreateNotificationRequest,
SyncPushRequest, SyncPullResponse,
@@ -1930,6 +1932,33 @@ def check_auth(payload=None):
if request.method == "GET":
return jsonify({"success": True, "message": "Authentication check successful"}), 200
# --------------------------
# Health endpoint
# --------------------------
@app.route("/health", methods=["GET"])
@validate_request(
    operation_id="check_health",
    summary="System Health Check",
    description="Retrieve system vitality metrics including database size, memory pressure, system load, disk usage, and CPU temperature.",
    response_model=HealthCheckResponse,
    tags=["system", "health"],
    auth_callable=is_authorized
)
def check_health(payload=None):
    """
    Serve the current system health metrics as JSON.

    Args:
        payload: Not used by this handler; defaults to None.
            (presumably supplied by the validate_request wrapper - confirm)

    Returns:
        tuple: (response, status). On success the body is {"success": True}
        merged with the dict from get_health_status() and the status is 200.
        If anything raises, the error is logged and a 500 response carrying
        "error" and "message" fields is returned instead.
    """
    try:
        body = {"success": True}
        # Metrics are merged after the flag, exactly like a ** expansion would.
        body.update(get_health_status())
        return jsonify(body), 200
    except Exception as exc:
        mylog("none", [f"[health] Error retrieving health status: {exc}"])
        failure = {
            "success": False,
            "error": "Failed to retrieve health status",
            "message": str(exc),
        }
        return jsonify(failure), 500
# --------------------------
# Background Server Start
# --------------------------

View File

@@ -0,0 +1,147 @@
"""Health check endpoint for NetAlertX system vitality monitoring."""
import os
import psutil
from pathlib import Path
from const import dbPath, dataPath
from logger import mylog
# ===============================================================================
# Database Vitality
# ===============================================================================
def get_db_size_mb():
    """
    Calculate total database size in MB (app.db + app.db-wal).

    Each file is stat'ed directly rather than probed with exists() first.
    A file that is absent, or that disappears while we are looking (the
    -wal file can presumably be removed by SQLite at any moment), only
    drops its own contribution instead of zeroing the whole result.

    Returns:
        float: Combined size in MB rounded to 2 decimals. A missing file
        counts as 0 bytes; 0.0 is returned on any other error.
    """
    try:
        size_bytes = 0
        for candidate in (Path(dbPath), Path(f"{dbPath}-wal")):
            try:
                size_bytes += candidate.stat().st_size
            except (FileNotFoundError, NotADirectoryError):
                # Same cases exists() would have reported as "not there".
                continue
        return round(size_bytes / (1024 * 1024), 2)
    except Exception as e:
        # Best-effort metric: never let a stat failure break the health check.
        mylog("verbose", [f"[health] Error calculating DB size: {e}"])
        return 0.0
# ===============================================================================
# Memory Pressure
# ===============================================================================
def get_mem_usage_pct():
    """
    Report memory in use as a whole-number share of total memory.

    Returns:
        int: used / total * 100, truncated to an integer and clamped to
        0-100, or -1 when the reading fails.
    """
    try:
        mem = psutil.virtual_memory()
        ratio = mem.used / mem.total
        # Truncate first, then keep the value inside the 0-100 range.
        return min(100, max(0, int(ratio * 100)))
    except Exception as err:
        mylog("verbose", [f"[health] Error calculating memory usage: {err}"])
        return -1
# ===============================================================================
# System Stress
# ===============================================================================
def get_load_avg_1m():
    """
    Read the system's 1-minute load average.

    Returns:
        float: The 1-minute figure rounded to 2 decimals, or -1.0 when it
        cannot be obtained.
    """
    try:
        # os.getloadavg() yields (1m, 5m, 15m); only the first is reported.
        one_minute = os.getloadavg()[0]
        return round(one_minute, 2)
    except Exception as err:
        mylog("verbose", [f"[health] Error getting load average: {err}"])
        return -1.0
# ===============================================================================
# Disk Headroom
# ===============================================================================
def get_storage_pct():
    """
    Report how full the filesystem holding dataPath is.

    Returns:
        int: Used space as a truncated integer percentage clamped to 0-100
        (0 when the filesystem reports no capacity), or -1 on error.
    """
    try:
        fs = os.statvfs(dataPath)
        block_size = fs.f_frsize
        capacity = fs.f_blocks * block_size
        if capacity > 0:
            # "Used" is everything that is not free, counted in bytes.
            consumed = (fs.f_blocks - fs.f_bfree) * block_size
            pct = int(consumed / capacity * 100)
        else:
            pct = 0
        return min(100, max(0, pct))
    except Exception as err:
        mylog("verbose", [f"[health] Error calculating storage usage: {err}"])
        return -1
# ===============================================================================
# Thermal Health
# ===============================================================================
def get_cpu_temp():
    """
    Read a CPU temperature from the hardware sensors psutil exposes.

    The 'coretemp' sensor group is preferred; otherwise the first group
    that has any readings is used.

    Returns:
        int: Temperature of the chosen group's first reading, truncated to
        an integer, or None if no reading is available or the lookup fails.
    """
    try:
        sensors = psutil.sensors_temperatures()
        if not sensors:
            return None
        # 'coretemp' goes first; empty or missing groups are skipped.
        candidates = [sensors.get("coretemp"), *sensors.values()]
        chosen = next((group for group in candidates if group), None)
        if chosen is None:
            return None
        return int(chosen[0].current)
    except Exception as err:
        mylog("verbose", [f"[health] Error reading CPU temperature: {err}"])
        return None
# ===============================================================================
# Aggregator
# ===============================================================================
def get_health_status():
    """
    Run every health collector and gather the results under fixed keys.

    Returns:
        dict: One entry per metric - db_size_mb, mem_usage_pct, load_1m,
        storage_pct and cpu_temp - in that order.
    """
    # (response key, collector) pairs; collectors run in the listed order.
    collectors = (
        ("db_size_mb", get_db_size_mb),
        ("mem_usage_pct", get_mem_usage_pct),
        ("load_1m", get_load_avg_1m),
        ("storage_pct", get_storage_pct),
        ("cpu_temp", get_cpu_temp),
    )
    return {key: collect() for key, collect in collectors}

View File

@@ -651,6 +651,34 @@ class NetworkInterfacesResponse(BaseResponse):
interfaces: Dict[str, Any] = Field(..., description="Details about network interfaces.")
# =============================================================================
# HEALTH CHECK SCHEMAS
# =============================================================================
class HealthCheckResponse(BaseResponse):
    """System health check with vitality metrics.

    The percentage fields accept -1 as well as 0-100 because the collectors
    in health_endpoint return -1 when a reading fails; a stricter lower bound
    would make the schema contradict values the endpoint can really emit.
    """
    model_config = ConfigDict(
        extra="allow",
        json_schema_extra={
            "examples": [{
                "success": True,
                "db_size_mb": 125.45,
                "mem_usage_pct": 65,
                "load_1m": 2.15,
                "storage_pct": 42,
                "cpu_temp": 58
            }]
        }
    )
    db_size_mb: float = Field(..., description="Database size in MB (app.db + app.db-wal)")
    # ge=-1 admits the error sentinel returned by get_mem_usage_pct().
    mem_usage_pct: int = Field(..., ge=-1, le=100, description="Memory usage percentage (0-100), or -1 if it could not be read")
    load_1m: float = Field(..., description="1-minute load average, or -1 if it could not be read")
    # ge=-1 admits the error sentinel returned by get_storage_pct().
    storage_pct: int = Field(..., ge=-1, le=100, description="Disk usage percentage of /data mount (0-100), or -1 if it could not be read")
    cpu_temp: Optional[int] = Field(None, description="CPU temperature in Celsius (nullable if unavailable)")
# =============================================================================
# EVENTS SCHEMAS
# =============================================================================