sunbeam check: parallel execution, 5s timeout, external S3 check

All checks now run concurrently via ThreadPoolExecutor so total time
is bounded by the slowest single check, not their sum.

Timeout reduced from 10s to 5s per check. SeaweedFS check switched
from kubectl exec (wget not reliably available in container) to an
HTTP probe against the external S3 endpoint (https://s3.DOMAIN/) —
consistent with the "use external URLs for publicly facing services"
requirement. Any status below 500 is treated as healthy; 403 is the expected unauthenticated S3 response.
This commit is contained in:
2026-03-02 21:57:33 +00:00
parent 39a2f70c3b
commit 6bd59abd74
2 changed files with 41 additions and 31 deletions

View File

@@ -5,6 +5,7 @@ import ssl
import subprocess import subprocess
import urllib.error import urllib.error
import urllib.request import urllib.request
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Any from typing import Any
@@ -61,7 +62,7 @@ def _opener(ssl_ctx: ssl.SSLContext) -> urllib.request.OpenerDirector:
def _http_get(url: str, opener: urllib.request.OpenerDirector, *, def _http_get(url: str, opener: urllib.request.OpenerDirector, *,
headers: dict | None = None, timeout: int = 10) -> tuple[int, bytes]: headers: dict | None = None, timeout: int = 5) -> tuple[int, bytes]:
"""Return (status_code, body). Redirects are not followed. """Return (status_code, body). Redirects are not followed.
Any network/SSL error (including TimeoutError) is re-raised as URLError Any network/SSL error (including TimeoutError) is re-raised as URLError
@@ -155,16 +156,14 @@ def check_openbao(domain: str, opener) -> CheckResult:
def check_seaweedfs(domain: str, opener) -> CheckResult: def check_seaweedfs(domain: str, opener) -> CheckResult:
"""kubectl exec seaweedfs-filer pod -- wget /dir/status -> filer responding.""" """GET https://s3.{domain}/ -> any response from the S3 API (< 500)."""
pod = kube_out("get", "pods", "-n", "storage", "-l", "app=seaweedfs-filer", url = f"https://s3.{domain}/"
"--no-headers", "-o=custom-columns=NAME:.metadata.name") try:
pod = pod.splitlines()[0].strip() if pod else "" status, _ = _http_get(url, opener)
if not pod: # Unauthenticated S3 returns 403 (expected); 200 also ok; 5xx = problem.
return CheckResult("seaweedfs", "storage", "seaweedfs", False, "no seaweedfs-filer pod") return CheckResult("seaweedfs", "storage", "seaweedfs", status < 500, f"HTTP {status}")
rc, out = kube_exec("storage", pod, "wget", "-qO-", "http://localhost:8888/dir/status") except urllib.error.URLError as e:
if rc == 0 and out: return CheckResult("seaweedfs", "storage", "seaweedfs", False, str(e.reason))
return CheckResult("seaweedfs", "storage", "seaweedfs", True, "filer responding")
return CheckResult("seaweedfs", "storage", "seaweedfs", False, "filer not responding")
def check_kratos(domain: str, opener) -> CheckResult: def check_kratos(domain: str, opener) -> CheckResult:
@@ -244,6 +243,13 @@ CHECKS: list[tuple[Any, str, str]] = [
] ]
def _run_one(fn, domain: str, op, ns: str, svc: str) -> CheckResult:
    """Run a single check function and always hand back a CheckResult.

    Calls ``fn(domain, op)``. If the check raises any ``Exception``, the
    error is converted into a failed CheckResult instead of propagating,
    so one misbehaving check cannot abort the whole run.

    Args:
        fn: Check callable invoked as ``fn(domain, op)``.
        domain: Domain passed through to the check.
        op: Second positional argument passed through to the check.
        ns: Namespace recorded on the fallback result.
        svc: Service name recorded on the fallback result.

    Returns:
        The check's own result, or a failed CheckResult whose name is the
        function name with ``"check_"`` removed and whose detail is the
        exception text truncated to 80 characters.
    """
    try:
        outcome = fn(domain, op)
    except Exception as exc:
        # Derive the display name from the function name.
        label = fn.__name__.replace("check_", "")
        # Keep the message short so it fits on one output line.
        detail = str(exc)[:80]
        outcome = CheckResult(label, ns, svc, False, detail)
    return outcome
def cmd_check(target: str | None) -> None: def cmd_check(target: str | None) -> None:
"""Run service-level health checks, optionally scoped to a namespace or service.""" """Run service-level health checks, optionally scoped to a namespace or service."""
step("Service health checks...") step("Service health checks...")
@@ -263,14 +269,11 @@ def cmd_check(target: str | None) -> None:
warn(f"No checks match target: {target}") warn(f"No checks match target: {target}")
return return
# Run all checks; catch any unexpected exception so we never crash. # Run all checks concurrently — total time ≈ slowest single check.
results = [] with ThreadPoolExecutor(max_workers=len(selected)) as pool:
for fn, ns, svc in selected: futures = [pool.submit(_run_one, fn, domain, op, ns, svc)
try: for fn, ns, svc in selected]
r = fn(domain, op) results = [f.result() for f in futures]
except Exception as e:
r = CheckResult(fn.__name__.replace("check_", ""), ns, svc, False, str(e)[:80])
results.append(r)
# Print grouped by namespace (mirrors sunbeam status layout). # Print grouped by namespace (mirrors sunbeam status layout).
name_w = max(len(r.name) for r in results) name_w = max(len(r.name) for r in results)

View File

@@ -126,22 +126,29 @@ class TestCheckOpenbao(unittest.TestCase):
class TestCheckSeaweedfs(unittest.TestCase): class TestCheckSeaweedfs(unittest.TestCase):
def test_responding_passes(self): def test_200_passes(self):
with patch("sunbeam.checks.kube_out", return_value="seaweedfs-filer-abc"): with patch("sunbeam.checks._http_get", return_value=(200, b"")):
with patch("sunbeam.checks.kube_exec", return_value=(0, "filer status data")):
from sunbeam import checks from sunbeam import checks
r = checks.check_seaweedfs("testdomain", None) r = checks.check_seaweedfs("testdomain", None)
self.assertTrue(r.passed) self.assertTrue(r.passed)
def test_no_pod_fails(self): def test_403_unauthenticated_passes(self):
with patch("sunbeam.checks.kube_out", return_value=""): # S3 returns 403 for unauthenticated requests — that means it's up.
with patch("sunbeam.checks._http_get", return_value=(403, b"")):
from sunbeam import checks
r = checks.check_seaweedfs("testdomain", None)
self.assertTrue(r.passed)
def test_502_fails(self):
with patch("sunbeam.checks._http_get", return_value=(502, b"")):
from sunbeam import checks from sunbeam import checks
r = checks.check_seaweedfs("testdomain", None) r = checks.check_seaweedfs("testdomain", None)
self.assertFalse(r.passed) self.assertFalse(r.passed)
def test_exec_fails(self): def test_connection_error_fails(self):
with patch("sunbeam.checks.kube_out", return_value="seaweedfs-filer-abc"): import urllib.error
with patch("sunbeam.checks.kube_exec", return_value=(1, "")): with patch("sunbeam.checks._http_get",
side_effect=urllib.error.URLError("refused")):
from sunbeam import checks from sunbeam import checks
r = checks.check_seaweedfs("testdomain", None) r = checks.check_seaweedfs("testdomain", None)
self.assertFalse(r.passed) self.assertFalse(r.passed)