Infrastructure Health Check Script
Infrastructure Health Check Script
The Feature
A Python script runs daily (via cron or GitHub Actions) and checks every infrastructure component against the thresholds defined in Chapter 14. It produces a plain-text summary: green (healthy), yellow (investigate), or red (act now) for each component. If any component is red, the developer receives a notification.
The Decision
A script, not a service. Not a monitoring tool with a database and a web UI. A script that runs, checks, prints results, and exits. It queries the same metrics that Grafana Cloud collects, but instead of watching a dashboard, the developer receives a daily summary. If everything is green, the summary takes two seconds to read and the developer moves on.
The Implementation
Health Check Script
#!/usr/bin/env python3
"""Daily infrastructure health check for Marketflow."""
import asyncio
import json
import os
import sys
import httpx
import asyncpg
# Thresholds (from CH14), keyed by metric name. Each entry carries the
# "yellow" and "red" limits that a reading is compared against.
THRESHOLDS = {
    metric: {"yellow": yellow, "red": red}
    for metric, yellow, red in (
        ("cpu_percent", 70, 90),
        ("memory_percent", 80, 90),
        ("disk_percent", 70, 85),
        ("db_size_mb", 400, 480),  # 500 MB free tier
        ("db_connections_percent", 80, 95),
        ("response_time_p95_ms", 500, 2000),
        ("error_rate_percent", 2, 5),
        ("redis_memory_percent", 75, 90),
    )
}
def status_for(metric: str, value: float) -> str:
    """Classify a reading against the limits stored in THRESHOLDS.

    Args:
        metric: Key into THRESHOLDS; an unknown key raises KeyError.
        value: The measured value for that metric.

    Returns:
        "RED" when the value reaches the red limit, "YELLOW" when it reaches
        the yellow limit, otherwise "GREEN".
    """
    limits = THRESHOLDS[metric]
    # Most severe level first so a value past both limits reports RED
    for label in ("RED", "YELLOW"):
        if value >= limits[label.lower()]:
            return label
    return "GREEN"
def status_icon(status: str) -> str:
    """Return the bracketed marker used in the report for a status label.

    Raises:
        KeyError: If the status is not GREEN, YELLOW or RED.
    """
    markers = {
        "GREEN": "[OK]",
        "YELLOW": "[!!]",
        "RED": "[XX]",
    }
    return markers[status]
async def check_system_metrics(api_url: str) -> list[dict]:
    """Check API availability through its ``/health`` endpoint.

    The Prometheus ``/metrics`` endpoint is deliberately not requested: its
    payload is not evaluated here, so fetching it could only add a way for
    the API row to turn RED without informing the result.

    Args:
        api_url: Base URL of the API; a trailing slash is tolerated.

    Returns:
        A single-element list holding the "API Health" result dict with
        ``component``, ``status`` and ``value`` keys. The status is GREEN only
        when the endpoint answers with JSON whose ``status`` field equals
        ``"healthy"``. Every failure is reported as RED instead of raised.
    """

    def result(status: str, value) -> list[dict]:
        return [{"component": "API Health", "status": status, "value": value}]

    try:
        async with httpx.AsyncClient(timeout=10) as client:
            health_resp = await client.get(f"{api_url.rstrip('/')}/health")
    except Exception as e:  # boundary: any transport failure becomes a RED row
        return result("RED", f"Unreachable: {e}")

    try:
        state = health_resp.json()["status"]
    except (ValueError, KeyError, TypeError):
        # The server answered, but not with the expected JSON document
        return result(
            "RED",
            f"Invalid health response (HTTP {health_resp.status_code})",
        )

    return result("GREEN" if state == "healthy" else "RED", state)
async def check_database(database_url: str) -> list[dict]:
    """Check database size and connection count.

    Args:
        database_url: Connection URL handed to ``asyncpg.connect``.

    Returns:
        Result dicts (``component``, ``status``, ``value``) for
        "Database Size" and "DB Connections". If connecting or any query
        fails, a RED "Database" entry is appended after the checks that
        already completed; no exception escapes.
    """
    results = []
    try:
        conn = await asyncpg.connect(database_url)
        try:
            # Database size
            size_bytes = await conn.fetchval(
                "SELECT pg_database_size(current_database())"
            )
            size_mb = size_bytes / (1024 * 1024)
            results.append({
                "component": "Database Size",
                "status": status_for("db_size_mb", size_mb),
                "value": f"{size_mb:.0f} MB / 500 MB",
            })

            # Connection count
            # NOTE(review): pg_stat_activity may also list non-client server
            # processes, which would slightly overstate usage -- confirm.
            active = await conn.fetchval(
                "SELECT count(*) FROM pg_stat_activity"
            )
            max_conn = await conn.fetchval(
                "SELECT setting::int FROM pg_settings WHERE name = 'max_connections'"
            )
            conn_pct = (active / max_conn) * 100
            results.append({
                "component": "DB Connections",
                "status": status_for("db_connections_percent", conn_pct),
                "value": f"{active}/{max_conn} ({conn_pct:.0f}%)",
            })
        finally:
            # Release the connection even when a query raised
            await conn.close()
    except Exception as e:
        results.append({
            "component": "Database",
            "status": "RED",
            "value": f"Connection failed: {e}",
        })
    return results
async def check_redis(redis_url: str) -> list[dict]:
    """Check Redis memory usage.

    Args:
        redis_url: Connection URL handed to ``redis.asyncio.from_url``.

    Returns:
        A single-element list with either a "Redis Memory" result (graded
        against ``redis_memory_percent`` when ``maxmemory`` is set, GREEN when
        no limit is configured) or a RED "Redis" entry if the import,
        connection or query failed. No exception escapes.
    """
    results = []
    try:
        # Imported here so a missing redis package shows up as a RED row
        import redis.asyncio as aioredis

        # Bound connect/read time like the HTTP check so the script cannot hang
        r = aioredis.from_url(
            redis_url,
            socket_connect_timeout=10,
            socket_timeout=10,
        )
        try:
            info = await r.info("memory")
        finally:
            # Close the client even when the INFO call raised
            await r.aclose()

        used_mb = info["used_memory"] / (1024 * 1024)
        max_mb = info.get("maxmemory", 0) / (1024 * 1024)
        if max_mb > 0:
            pct = (used_mb / max_mb) * 100
            results.append({
                "component": "Redis Memory",
                "status": status_for("redis_memory_percent", pct),
                "value": f"{used_mb:.0f} MB / {max_mb:.0f} MB ({pct:.0f}%)",
            })
        else:
            # maxmemory of 0 means no limit, so there is nothing to grade against
            results.append({
                "component": "Redis Memory",
                "status": "GREEN",
                "value": f"{used_mb:.0f} MB (no limit set)",
            })
    except Exception as e:
        results.append({
            "component": "Redis",
            "status": "RED",
            "value": f"Connection failed: {e}",
        })
    return results
def print_report(results: list[dict]) -> bool:
    """Write the health report to stdout.

    Args:
        results: Result dicts carrying ``component``, ``status`` and
            ``value`` keys.

    Returns:
        True when at least one result has status "RED", otherwise False.
    """
    rule = "=" * 60

    print(rule)
    print(" MARKETFLOW INFRASTRUCTURE HEALTH CHECK")
    print(rule)

    # One line per component, component name padded with dots to 30 columns
    for entry in results:
        marker = status_icon(entry["status"])
        print(f" {marker} {entry['component']:.<30} {entry['value']}")
    print(rule)

    seen = {entry["status"] for entry in results}
    any_red = "RED" in seen
    if any_red:
        verdict = " STATUS: ACTION REQUIRED"
    elif "YELLOW" in seen:
        verdict = " STATUS: INVESTIGATE"
    else:
        verdict = " STATUS: ALL HEALTHY"
    print(verdict)
    print(rule)
    return any_red
async def main():
    """Run every check, print the report, and exit with 1 if any check is RED.

    Endpoints come from the API_URL, DATABASE_URL and REDIS_URL environment
    variables. The database check is skipped when DATABASE_URL is empty or
    unset.
    """
    api_url = os.getenv("API_URL", "http://localhost:8000")
    database_url = os.getenv("DATABASE_URL", "")
    redis_url = os.getenv("REDIS_URL", "redis://localhost:6379")

    findings = [*await check_system_metrics(api_url)]
    if database_url:
        findings += await check_database(database_url)
    findings += await check_redis(redis_url)

    # Non-zero exit status is what the scheduler turns into a notification
    exit_code = 1 if print_report(findings) else 0
    raise SystemExit(exit_code)


if __name__ == "__main__":
    asyncio.run(main())
Scheduled Execution with GitHub Actions
# .github/workflows/health-check.yml
name: Daily Infrastructure Health Check

on:
  schedule:
    - cron: "0 8 * * *"  # 8 AM UTC daily
  workflow_dispatch:  # Allow manual trigger

# The job only reads the repository, so the token needs no write scopes.
permissions:
  contents: read

jobs:
  health-check:
    runs-on: ubuntu-latest
    # Bound the run so a hung connection cannot burn Actions minutes
    # (the default job timeout is 360 minutes).
    timeout-minutes: 5
    steps:
      - uses: actions/checkout@v4
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.12"
      - name: Install dependencies
        run: pip install httpx asyncpg redis
      - name: Run health check
        env:
          API_URL: ${{ secrets.PRODUCTION_API_URL }}
          DATABASE_URL: ${{ secrets.PRODUCTION_DATABASE_URL }}
          REDIS_URL: ${{ secrets.PRODUCTION_REDIS_URL }}
        run: python scripts/health_check.py
      # If the health check exits with code 1 (RED status),
      # the GitHub Actions step fails and sends a notification
      # via GitHub's built-in email notifications for failed workflows.
Alternative: Cron on the VPS
# On the Hetzner VPS, add to crontab:
# crontab -e
# cron mails whatever a job prints (to MAILTO, or to the crontab owner); it does
# not look at the exit status. Capture the report and print it only when the
# check fails, so mail is sent only on RED and nothing accumulates on disk.
0 8 * * * cd /opt/marketflow && out=$(python3 scripts/health_check.py 2>&1) || echo "$out"
The Trap
# TRAP: The health check script that checks everything and alerts on nothing
# It logs 50 metrics to a file that nobody reads
# The file grows to 10 MB and fills the disk
# The very thing the script should detect (disk usage) is caused by the script
# SAFE: Check only what matters, alert only on actionable thresholds
# 6 components, 2 thresholds each
# Non-zero exit code triggers existing notification (GitHub Actions, cron mail)
# No persistent log storage needed
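A health check script is only useful if someone acts on its output. Non-zero exit codes integrate with existing notification systems. GitHub Actions sends an email when a workflow fails. Cron does not inspect the exit code: it emails whatever a job writes to stdout or stderr (given a working MAILTO address and mail transfer agent), so a cron job that prints its report only when the check fails produces mail only on failure. No additional notification infrastructure needed.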
The Cost
| Item | Cost |
|---|---|
| Health check script | $0 |
| GitHub Actions (scheduled) | Free for public repos, 2,000 min/month for private |
| Cron on VPS | $0 |
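The script runs in under 10 seconds. GitHub Actions free tier provides 2,000 minutes per month for private repositories. The raw script time adds up to about 5 minutes per month, but GitHub rounds each job up to the next whole minute and the job also spends time on checkout, Python setup, and dependency installation, so budget roughly 30 to 60 billed minutes per month for a daily run, still a small fraction of the allowance.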