Files
cli/sunbeam/checks.py
Sienna Meridian Satterwhite 1573faa0fd Add sunbeam check verb with service-level health probes
11 checks across 7 namespaces: gitea version+auth, postgres CNPG
readiness, valkey PONG, openbao sealed state, seaweedfs filer,
kratos health, hydra OIDC discovery, people HTTP (catches 502s),
people API, and livekit. Supports ns and ns/svc scoping.

- checks.py: new module with _http_get (no-redirect opener + mkcert SSL),
  kube_exec-based exec checks, and cmd_check dispatch
- kube.py: add kube_exec() and get_domain() (reads from cluster configmap)
- cli.py: add 'check [target]' verb
- 103 tests, all passing
2026-03-02 21:49:57 +00:00

270 lines
11 KiB
Python

"""Service-level health checks — functional probes beyond pod readiness."""
import base64
import json
import ssl
import subprocess
import urllib.error
import urllib.request
from dataclasses import dataclass
from pathlib import Path
from typing import Any
from sunbeam.kube import get_domain, kube_exec, kube_out, parse_target
from sunbeam.output import ok, step, warn
@dataclass
class CheckResult:
    """Outcome of a single service-level health probe."""

    # Short identifier of the probe, e.g. "gitea-version".
    name: str
    # Namespace label the probe is registered under, e.g. "devtools".
    ns: str
    # Service label the probe is registered under, e.g. "gitea".
    svc: str
    # True when the probe succeeded.
    passed: bool
    # Optional human-readable detail (version, HTTP status, error reason).
    detail: str = ""
def _ssl_ctx() -> ssl.SSLContext:
"""Return an SSL context that trusts the mkcert local CA if available."""
ctx = ssl.create_default_context()
try:
r = subprocess.run(["mkcert", "-CAROOT"], capture_output=True, text=True)
if r.returncode == 0:
ca_file = Path(r.stdout.strip()) / "rootCA.pem"
if ca_file.exists():
ctx.load_verify_locations(cafile=str(ca_file))
except FileNotFoundError:
pass
return ctx
def _kube_secret(ns: str, name: str, key: str) -> str:
    """Fetch one key of a Kubernetes secret and return it base64-decoded.

    Args:
        ns: Namespace passed to ``-n``.
        name: Name of the secret.
        key: Key under ``.data`` to read.

    Returns:
        The decoded value, or ``""`` when nothing was returned or the value
        could not be decoded.
    """
    jsonpath = f"-o=jsonpath={{.data.{key}}}"
    encoded = kube_out("get", "secret", name, "-n", ns, jsonpath)
    decoded = ""
    if encoded:
        try:
            # "==" is appended before decoding -- presumably so values whose
            # padding was dropped still decode; confirm against kube_out.
            decoded = base64.b64decode(encoded + "==").decode()
        except Exception:
            # Deliberately best-effort: any decode failure yields "".
            decoded = ""
    return decoded
class _NoRedirect(urllib.request.HTTPRedirectHandler):
    """Prevent urllib from following redirects so we can inspect the status code."""

    def redirect_request(self, req, fp, code, msg, headers, newurl):
        # Never build a follow-up request: returning None for every redirect
        # means the 3xx response itself is what the caller gets to see.
        return None
def _opener(ssl_ctx: ssl.SSLContext) -> urllib.request.OpenerDirector:
    """Build an opener that uses *ssl_ctx* for HTTPS and does not follow redirects.

    Args:
        ssl_ctx: SSL context handed to the HTTPS handler.

    Returns:
        An ``OpenerDirector`` combining ``_NoRedirect`` with an HTTPS handler.
    """
    redirect_blocker = _NoRedirect()
    https_handler = urllib.request.HTTPSHandler(context=ssl_ctx)
    return urllib.request.build_opener(redirect_blocker, https_handler)
def _http_get(url: str, opener: urllib.request.OpenerDirector, *,
headers: dict | None = None, timeout: int = 10) -> tuple[int, bytes]:
"""Return (status_code, body). Redirects are not followed."""
req = urllib.request.Request(url, headers=headers or {})
try:
with opener.open(req, timeout=timeout) as resp:
return resp.status, resp.read()
except urllib.error.HTTPError as e:
return e.code, b""
# ── Individual checks ─────────────────────────────────────────────────────────
def check_gitea_version(domain: str, opener) -> CheckResult:
    """Probe Gitea's version endpoint.

    Sends ``GET https://src.{domain}/api/v1/version`` and passes when the
    response is HTTP 200 with a JSON object body. The ``version`` field is
    reported in the detail (``?`` when absent).

    Args:
        domain: Base domain; the request targets its ``src.`` subdomain.
        opener: Opener passed through to ``_http_get``.

    Returns:
        A ``CheckResult`` named ``gitea-version`` for ``devtools/gitea``.
    """
    def result(passed: bool, detail: str) -> CheckResult:
        return CheckResult("gitea-version", "devtools", "gitea", passed, detail)

    url = f"https://src.{domain}/api/v1/version"
    try:
        status, body = _http_get(url, opener)
    except urllib.error.URLError as e:
        return result(False, str(e.reason))
    if status != 200:
        return result(False, f"HTTP {status}")
    try:
        payload = json.loads(body)
    except ValueError:
        # A 200 with a non-JSON body must fail this check, not raise and
        # abort every remaining check. JSONDecodeError and UnicodeDecodeError
        # are both ValueError subclasses.
        return result(False, "HTTP 200 but body is not valid JSON")
    if not isinstance(payload, dict):
        return result(False, "HTTP 200 but body is not a JSON object")
    return result(True, f"v{payload.get('version', '?')}")
def check_gitea_auth(domain: str, opener) -> CheckResult:
    """Verify that the Gitea admin credentials authenticate against the API.

    Reads ``admin-username`` and ``admin-password`` from the
    ``gitea-admin-credentials`` secret in ``devtools`` (username falls back
    to ``gitea_admin``), then sends ``GET https://src.{domain}/api/v1/user``
    with HTTP Basic auth. Passes on HTTP 200 with a JSON object body; the
    ``login`` field is reported in the detail (``?`` when absent).

    Args:
        domain: Base domain; the request targets its ``src.`` subdomain.
        opener: Opener passed through to ``_http_get``.

    Returns:
        A ``CheckResult`` named ``gitea-auth`` for ``devtools/gitea``.
    """
    def result(passed: bool, detail: str) -> CheckResult:
        return CheckResult("gitea-auth", "devtools", "gitea", passed, detail)

    username = _kube_secret("devtools", "gitea-admin-credentials", "admin-username") or "gitea_admin"
    password = _kube_secret("devtools", "gitea-admin-credentials", "admin-password")
    if not password:
        return result(False, "admin-password not found in secret")

    creds = base64.b64encode(f"{username}:{password}".encode()).decode()
    url = f"https://src.{domain}/api/v1/user"
    try:
        status, body = _http_get(url, opener, headers={"Authorization": f"Basic {creds}"})
    except urllib.error.URLError as e:
        return result(False, str(e.reason))
    if status != 200:
        return result(False, f"HTTP {status}")
    try:
        payload = json.loads(body)
    except ValueError:
        # A 200 with a non-JSON body must fail this check, not raise and
        # abort every remaining check.
        return result(False, "HTTP 200 but body is not valid JSON")
    if not isinstance(payload, dict):
        return result(False, "HTTP 200 but body is not a JSON object")
    return result(True, f"user={payload.get('login', '?')}")
def check_postgres(domain: str, opener) -> CheckResult:
    """Compare ready vs. total instances of the ``postgres`` cluster in ``data``.

    Passes only when both ``.status.readyInstances`` and ``.status.instances``
    are non-empty and equal. ``domain`` and ``opener`` are unused; they are
    accepted so all checks share one call signature.
    """
    ready_count = kube_out("get", "cluster", "postgres", "-n", "data",
                           "-o=jsonpath={.status.readyInstances}")
    instance_count = kube_out("get", "cluster", "postgres", "-n", "data",
                              "-o=jsonpath={.status.instances}")

    # Neither field came back: report the cluster as missing.
    if not (ready_count or instance_count):
        return CheckResult("postgres", "data", "postgres", False, "cluster not found")

    if ready_count and instance_count and ready_count == instance_count:
        return CheckResult("postgres", "data", "postgres", True,
                           f"{ready_count}/{instance_count} ready")

    summary = f"{ready_count or '?'}/{instance_count or '?'} ready"
    return CheckResult("postgres", "data", "postgres", False, summary)
def check_valkey(domain: str, opener) -> CheckResult:
    """Run ``valkey-cli ping`` inside the first ``app=valkey`` pod in ``data``.

    Passes when the command output is exactly ``PONG``. ``domain`` and
    ``opener`` are unused; they are accepted so all checks share one call
    signature.
    """
    listing = kube_out("get", "pods", "-n", "data", "-l", "app=valkey",
                       "--no-headers", "-o=custom-columns=NAME:.metadata.name")
    # Only the first listed pod is probed.
    pod_name = listing.splitlines()[0].strip() if listing else ""
    if not pod_name:
        return CheckResult("valkey", "data", "valkey", False, "no valkey pod")

    _rc, reply = kube_exec("data", pod_name, "valkey-cli", "ping")
    got_pong = reply == "PONG"
    return CheckResult("valkey", "data", "valkey", got_pong, reply or "no response")
def check_openbao(domain: str, opener) -> CheckResult:
    """Check that OpenBao is initialized and unsealed.

    Runs ``bao status -format=json`` in pod ``openbao-0`` (namespace ``data``)
    and inspects the ``initialized`` and ``sealed`` fields of the JSON output.
    The command's exit status is not consulted; only its output is judged.
    ``domain`` and ``opener`` are unused; they are accepted so all checks
    share one call signature.

    Returns:
        A ``CheckResult`` named ``openbao``. It passes only when the output
        is a JSON object with ``initialized`` truthy and ``sealed`` falsy.
        When the output is not a JSON object, its first 80 characters are
        reported as the detail.
    """
    _rc, out = kube_exec("data", "openbao-0", "bao", "status", "-format=json")
    if not out:
        return CheckResult("openbao", "data", "openbao", False, "no response")
    try:
        data = json.loads(out)
    except ValueError:
        return CheckResult("openbao", "data", "openbao", False, out[:80])
    if not isinstance(data, dict):
        # Valid JSON but not an object (e.g. a bare string or list): treat it
        # like unparseable output instead of failing on ``.get``.
        return CheckResult("openbao", "data", "openbao", False, out[:80])
    # Missing fields default to the unhealthy value.
    init = data.get("initialized", False)
    sealed = data.get("sealed", True)
    return CheckResult("openbao", "data", "openbao", bool(init and not sealed),
                       f"init={init}, sealed={sealed}")
def check_seaweedfs(domain: str, opener) -> CheckResult:
    """Fetch ``/dir/status`` from inside the first ``app=seaweedfs-filer`` pod.

    Runs ``wget -qO- http://localhost:8888/dir/status`` in the pod (namespace
    ``storage``) and passes when the command exits 0 with non-empty output.
    ``domain`` and ``opener`` are unused; they are accepted so all checks
    share one call signature.
    """
    listing = kube_out("get", "pods", "-n", "storage", "-l", "app=seaweedfs-filer",
                       "--no-headers", "-o=custom-columns=NAME:.metadata.name")
    # Only the first listed pod is probed.
    pod_name = listing.splitlines()[0].strip() if listing else ""
    if not pod_name:
        return CheckResult("seaweedfs", "storage", "seaweedfs", False, "no seaweedfs-filer pod")

    exit_code, output = kube_exec("storage", pod_name, "wget", "-qO-",
                                  "http://localhost:8888/dir/status")
    if exit_code != 0 or not output:
        return CheckResult("seaweedfs", "storage", "seaweedfs", False, "filer not responding")
    return CheckResult("seaweedfs", "storage", "seaweedfs", True, "filer responding")
def check_kratos(domain: str, opener) -> CheckResult:
    """Probe ``https://auth.{domain}/kratos/health/ready`` and expect HTTP 200.

    On a non-200 response, up to 80 characters of the response body (if any)
    are appended to the detail.
    """
    endpoint = f"https://auth.{domain}/kratos/health/ready"
    try:
        status, body = _http_get(endpoint, opener)
    except urllib.error.URLError as err:
        return CheckResult("kratos", "ory", "kratos", False, str(err.reason))

    healthy = status == 200
    message = f"HTTP {status}"
    if body and not healthy:
        snippet = body.decode(errors='replace')[:80]
        message = f"{message}: {snippet}"
    return CheckResult("kratos", "ory", "kratos", healthy, message)
def check_hydra_oidc(domain: str, opener) -> CheckResult:
    """Probe the OIDC discovery document served under ``auth.{domain}``.

    Sends ``GET https://auth.{domain}/.well-known/openid-configuration`` and
    passes when the response is HTTP 200 with a JSON object body. The
    ``issuer`` field is reported in the detail (``?`` when absent).

    Args:
        domain: Base domain; the request targets its ``auth.`` subdomain.
        opener: Opener passed through to ``_http_get``.

    Returns:
        A ``CheckResult`` named ``hydra-oidc`` for ``ory/hydra``.
    """
    def result(passed: bool, detail: str) -> CheckResult:
        return CheckResult("hydra-oidc", "ory", "hydra", passed, detail)

    url = f"https://auth.{domain}/.well-known/openid-configuration"
    try:
        status, body = _http_get(url, opener)
    except urllib.error.URLError as e:
        return result(False, str(e.reason))
    if status != 200:
        return result(False, f"HTTP {status}")
    try:
        payload = json.loads(body)
    except ValueError:
        # A 200 with a non-JSON body must fail this check, not raise and
        # abort every remaining check.
        return result(False, "HTTP 200 but body is not valid JSON")
    if not isinstance(payload, dict):
        return result(False, "HTTP 200 but body is not a JSON object")
    return result(True, f"issuer={payload.get('issuer', '?')}")
def check_people(domain: str, opener) -> CheckResult:
    """Request ``https://people.{domain}/`` and accept any status below 500.

    A redirect (e.g. 302 to OIDC login) counts as healthy; only server
    errors and connection failures fail the check.
    """
    endpoint = f"https://people.{domain}/"
    try:
        status, _body = _http_get(endpoint, opener)
    except urllib.error.URLError as err:
        return CheckResult("people", "lasuite", "people", False, str(err.reason))
    not_server_error = status < 500
    return CheckResult("people", "lasuite", "people", not_server_error, f"HTTP {status}")
def check_people_api(domain: str, opener) -> CheckResult:
    """Request ``https://people.{domain}/api/v1.0/config/`` and accept any status below 500.

    An auth-required response (e.g. 401) counts as healthy; only server
    errors and connection failures fail the check.
    """
    endpoint = f"https://people.{domain}/api/v1.0/config/"
    try:
        status, _body = _http_get(endpoint, opener)
    except urllib.error.URLError as err:
        return CheckResult("people-api", "lasuite", "people", False, str(err.reason))
    not_server_error = status < 500
    return CheckResult("people-api", "lasuite", "people", not_server_error, f"HTTP {status}")
def check_livekit(domain: str, opener) -> CheckResult:
    """Fetch ``http://localhost:7880/`` from inside the first livekit-server pod.

    Runs ``wget -qO-`` in the pod (namespace ``media``, label
    ``app.kubernetes.io/name=livekit-server``) and passes when the command
    exits 0. ``domain`` and ``opener`` are unused; they are accepted so all
    checks share one call signature.
    """
    listing = kube_out("get", "pods", "-n", "media", "-l",
                       "app.kubernetes.io/name=livekit-server",
                       "--no-headers", "-o=custom-columns=NAME:.metadata.name")
    # Only the first listed pod is probed.
    pod_name = listing.splitlines()[0].strip() if listing else ""
    if not pod_name:
        return CheckResult("livekit", "media", "livekit", False, "no livekit pod")

    exit_code, _output = kube_exec("media", pod_name, "wget", "-qO-", "http://localhost:7880/")
    if exit_code != 0:
        return CheckResult("livekit", "media", "livekit", False, "server not responding")
    return CheckResult("livekit", "media", "livekit", True, "server responding")
# ── Check registry ────────────────────────────────────────────────────────────
# Registry of all checks as (function, namespace, service) tuples.
# cmd_check filters on the namespace/service columns and runs the selected
# functions in the order listed here.
CHECKS: list[tuple[Any, str, str]] = [
    (check_gitea_version, "devtools", "gitea"),
    (check_gitea_auth, "devtools", "gitea"),
    (check_postgres, "data", "postgres"),
    (check_valkey, "data", "valkey"),
    (check_openbao, "data", "openbao"),
    (check_seaweedfs, "storage", "seaweedfs"),
    (check_kratos, "ory", "kratos"),
    (check_hydra_oidc, "ory", "hydra"),
    (check_people, "lasuite", "people"),
    (check_people_api, "lasuite", "people"),
    (check_livekit, "media", "livekit"),
]
def cmd_check(target: str | None) -> None:
    """Run the registered health checks and print one line per result.

    Args:
        target: Optional scope, parsed by ``parse_target`` into a namespace
            and/or service filter. ``None`` or empty runs every check.

    Prints a pass/fail line for each check followed by a summary; warns and
    returns early when no registered check matches ``target``.
    """
    step("Service health checks...")
    domain = get_domain()
    opener = _opener(_ssl_ctx())

    if target:
        ns_filter, svc_filter = parse_target(target)
    else:
        ns_filter = svc_filter = None

    # Keep registry order; a filter of None matches everything.
    selected = []
    for fn, ns, svc in CHECKS:
        if ns_filter is not None and ns != ns_filter:
            continue
        if svc_filter is not None and svc != svc_filter:
            continue
        selected.append(fn)

    if not selected:
        warn(f"No checks match target: {target}")
        return

    outcomes = []
    for check in selected:
        outcome = check(domain, opener)
        outcomes.append(outcome)
        mark = "\u2717"
        if outcome.passed:
            mark = "\u2713"
        suffix = ""
        if outcome.detail:
            suffix = f" ({outcome.detail})"
        print(f" {mark} {outcome.ns}/{outcome.svc} [{outcome.name}]{suffix}")

    print()
    failure_count = sum(1 for outcome in outcomes if not outcome.passed)
    if failure_count:
        warn(f"{failure_count} check(s) failed.")
    else:
        ok(f"All {len(outcomes)} check(s) passed.")