feat(infra): production bootstrap — cert-manager, longhorn, monitoring

Add new bases for cert-manager (Let's Encrypt + wildcard cert), Longhorn
distributed storage, and monitoring (kube-prometheus-stack + Loki + Tempo
+ Grafana OIDC). Add cloud-init for Scaleway Elastic Metal provisioning.

Production overlay: add patches for postgres sizing, SeaweedFS volume,
OpenSearch storage, LiveKit service, Pingora host ports, resource limits,
and CNPG daily barman backups. Update cert-manager.yaml with full dnsNames
for all *.sunbeam.pt subdomains.
This commit is contained in:
2026-03-06 12:06:27 +00:00
parent f7774558e9
commit 7ff35d3e0c
23 changed files with 855 additions and 35 deletions

View File

@@ -0,0 +1,15 @@
# base/cert-manager/kustomization.yaml
# Renders the cert-manager Helm chart via kustomize's helmCharts generator
# (requires `kustomize build --enable-helm`).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
helmCharts:
  # helm repo add jetstack https://charts.jetstack.io
  - name: cert-manager
    repo: https://charts.jetstack.io
    version: "1.19.4"
    releaseName: cert-manager
    namespace: cert-manager
    valuesFile: values.yaml
    # Emit chart CRDs in the rendered output (passes --include-crds to helm template).
    includeCRDs: true

View File

@@ -0,0 +1,4 @@
# Namespace for the cert-manager controller, webhook and cainjector.
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager

View File

@@ -0,0 +1,2 @@
# cert-manager chart values: render the CRDs as chart templates so they are
# installed/updated together with the release.
# NOTE(review): consider also setting `crds.keep: true` so CRDs (and all
# Certificate/Issuer objects) survive an accidental chart removal — verify the
# chart's default for this version.
crds:
  enabled: true

View File

@@ -0,0 +1,13 @@
# base/longhorn/kustomization.yaml
# Longhorn distributed block storage, rendered via kustomize's helmCharts
# generator (requires `kustomize build --enable-helm`).
# helm repo add longhorn https://charts.longhorn.io
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
helmCharts:
  - name: longhorn
    repo: https://charts.longhorn.io
    version: "1.11.0"
    releaseName: longhorn
    namespace: longhorn-system
    valuesFile: values.yaml

View File

@@ -0,0 +1,4 @@
# Namespace for Longhorn components (manager, instance-manager, UI, CSI).
apiVersion: v1
kind: Namespace
metadata:
  name: longhorn-system

24
base/longhorn/values.yaml Normal file
View File

@@ -0,0 +1,24 @@
# Longhorn distributed block storage
# Single-node production deployment — RAID1 hardware already provides redundancy.
defaultSettings:
  # 1 replica: RAID1 mirrors the disk, so software replication adds no benefit
  # on a single node and would halve available capacity.
  defaultReplicaCount: 1
  # Orphan auto-deletion: clean up node/instance orphaned resources automatically
  # NOTE(review): confirm this setting name is still `orphanAutoDeletion` in
  # Longhorn 1.11 — recent releases renamed/split some orphan settings.
  orphanAutoDeletion: true
  # Allow volumes to be scheduled on the only available node even when disk
  # pressure is detected (single-node: no other node to reschedule to).
  allowVolumeCreationWithDegradedAvailability: true
  # Reduce reserved percentage to 10% — RAID1 hardware provides physical redundancy,
  # so Longhorn doesn't need to hold back 30% for software replicas.
  # With 937 GiB disk: 843 GiB schedulable (600 SW + 100 PG + ~143 headroom).
  storageReservedPercentageForDefaultDisk: 10
# Set Longhorn as the default StorageClass.
persistence:
  defaultClass: true
  defaultClassReplicaCount: 1

View File

@@ -0,0 +1,32 @@
# Hydra OAuth2Client for Grafana OIDC sign-in.
#
# Hydra Maester watches this CRD and:
# 1. Registers the client with Hydra
# 2. Creates K8s Secret "grafana-oidc" in monitoring namespace
#    with CLIENT_ID and CLIENT_SECRET keys.
#
# Grafana picks up the secret via envFromSecret and interpolates
# ${CLIENT_ID} / ${CLIENT_SECRET} in grafana.ini at startup.
#
# DOMAIN_SUFFIX is substituted by sunbeam apply.
---
apiVersion: hydra.ory.sh/v1alpha1
kind: OAuth2Client
metadata:
  name: grafana
  namespace: monitoring
spec:
  clientName: Grafana
  grantTypes:
    - authorization_code
    - refresh_token
  responseTypes:
    - code
  # Space-delimited scope string, as required by the OAuth2Client schema.
  scope: openid email profile
  redirectUris:
    - https://grafana.DOMAIN_SUFFIX/login/generic_oauth
  postLogoutRedirectUris:
    - https://grafana.DOMAIN_SUFFIX/
  # NOTE(review): client_secret_post must match Grafana's token-request auth
  # style for generic_oauth — verify Grafana sends credentials in the POST body.
  tokenEndpointAuthMethod: client_secret_post
  # Name of the Secret Hydra Maester creates with the client credentials.
  secretName: grafana-oidc
  skipConsent: true

View File

@@ -0,0 +1,34 @@
# base/monitoring/kustomization.yaml
# Monitoring stack: kube-prometheus-stack (Prometheus/Alertmanager/Grafana),
# Loki (logs) and Tempo (traces), all in the `monitoring` namespace.
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  - namespace.yaml
  - vault-secrets.yaml
  - grafana-oauth2client.yaml
helmCharts:
  # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  - name: kube-prometheus-stack
    repo: https://prometheus-community.github.io/helm-charts
    version: "82.9.0"
    releaseName: kube-prometheus-stack
    namespace: monitoring
    valuesFile: prometheus-values.yaml
    # Prometheus-operator CRDs (ServiceMonitor, PrometheusRule, …) come from the chart.
    includeCRDs: true
  # helm repo add grafana https://grafana.github.io/helm-charts
  - name: loki
    repo: https://grafana.github.io/helm-charts
    version: "6.53.0"
    releaseName: loki
    namespace: monitoring
    valuesFile: loki-values.yaml
  - name: tempo
    repo: https://grafana.github.io/helm-charts
    version: "1.24.4"
    releaseName: tempo
    namespace: monitoring
    valuesFile: tempo-values.yaml

View File

@@ -0,0 +1,43 @@
# Loki — monolithic single-binary mode, filesystem storage, single tenant.
deploymentMode: SingleBinary
loki:
  # Single tenant: no X-Scope-OrgID header required from clients.
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  storage:
    type: filesystem
  schemaConfig:
    configs:
      # Date quoted so YAML doesn't parse it as a timestamp.
      - from: "2024-01-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: index_
          period: 24h
singleBinary:
  replicas: 1
  persistence:
    enabled: true
    size: 30Gi
# Disable sub-charts/probes not needed for single-node:
# the simple-scalable read/write/backend targets must be zeroed when
# deploymentMode is SingleBinary.
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
monitoring:
  selfMonitoring:
    enabled: false
  grafanaAgent:
    installOperator: false
lokiCanary:
  enabled: false
test:
  enabled: false
# NOTE(review): chart 6.x also deploys memcached chunk/result caches by default
# with sizeable memory requests — consider `chunksCache.enabled: false` /
# `resultsCache.enabled: false` if they fail to schedule on this single node.

View File

@@ -0,0 +1,4 @@
# Namespace for the whole observability stack (Prometheus, Grafana, Loki, Tempo).
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

View File

@@ -0,0 +1,96 @@
# kube-prometheus-stack — Prometheus + AlertManager + Grafana + node-exporter + kube-state-metrics
#
# k3s quirks: kube-proxy is replaced by Cilium; etcd/scheduler/controller-manager
# don't expose metrics on standard ports. Disable their monitors to avoid noise.
grafana:
adminUser: admin
admin:
existingSecret: grafana-admin
passwordKey: admin-password
persistence:
enabled: true
size: 2Gi
# Inject Hydra OIDC client credentials (created by Hydra Maester from the OAuth2Client CRD)
envFromSecret: grafana-oidc
grafana.ini:
server:
root_url: "https://grafana.DOMAIN_SUFFIX"
auth:
# Keep local login as fallback (admin password from grafana-admin secret)
disable_login_form: false
signout_redirect_url: "https://auth.DOMAIN_SUFFIX/oauth2/sessions/logout"
auth.generic_oauth:
enabled: true
name: Sunbeam
icon: signin
# CLIENT_ID / CLIENT_SECRET injected from grafana-oidc K8s Secret via envFromSecret
client_id: "${CLIENT_ID}"
client_secret: "${CLIENT_SECRET}"
scopes: "openid email profile"
auth_url: "https://auth.DOMAIN_SUFFIX/oauth2/auth"
token_url: "https://auth.DOMAIN_SUFFIX/oauth2/token"
api_url: "https://auth.DOMAIN_SUFFIX/userinfo"
allow_sign_up: true
# Small studio — anyone with a valid La Suite account is an admin.
# To restrict to specific users, set role_attribute_path instead.
auto_assign_org_role: Admin
skip_org_role_sync: true
additionalDataSources:
- name: Loki
type: loki
url: http://loki.monitoring.svc.cluster.local:3100
access: proxy
isDefault: false
- name: Tempo
type: tempo
url: http://tempo.monitoring.svc.cluster.local:3100
access: proxy
isDefault: false
prometheus:
prometheusSpec:
retention: 90d
storageSpec:
volumeClaimTemplate:
spec:
accessModes: [ReadWriteOnce]
resources:
requests:
storage: 30Gi
alertmanager:
alertmanagerSpec:
storage:
volumeClaimTemplate:
spec:
accessModes: [ReadWriteOnce]
resources:
requests:
storage: 2Gi
config:
global:
smtp_from: "alerts@DOMAIN_SUFFIX"
smtp_smarthost: "postfix.lasuite.svc.cluster.local:25"
smtp_require_tls: false
route:
group_by: [alertname, namespace]
group_wait: 30s
group_interval: 5m
repeat_interval: 12h
receiver: email
receivers:
- name: email
email_configs:
- to: "ops@DOMAIN_SUFFIX"
send_resolved: true
# Disable monitors for components k3s doesn't expose
kubeEtcd:
enabled: false
kubeControllerManager:
enabled: false
kubeScheduler:
enabled: false
kubeProxy:
enabled: false

View File

@@ -0,0 +1,26 @@
# Tempo — monolithic single-binary, local filesystem backend.
# Receives OTLP over gRPC (:4317) and HTTP (:4318).
tempo:
  # Opt out of anonymous usage reporting.
  reportingEnabled: false
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
  storage:
    trace:
      backend: local
      local:
        path: /var/tempo/traces
      wal:
        path: /var/tempo/wal
# Persist /var/tempo so traces survive pod restarts (chart top-level key).
persistence:
  enabled: true
  size: 20Gi
# Expose OTLP ports as a ClusterIP service
service:
  type: ClusterIP

View File

@@ -0,0 +1,36 @@
# Vault Secrets Operator wiring for the monitoring namespace.
---
# Kubernetes auth binding: VSO authenticates to OpenBao as the `default`
# ServiceAccount using the `vso` role on the `kubernetes` auth mount.
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultAuth
metadata:
  name: vso-auth
  namespace: monitoring
spec:
  method: kubernetes
  mount: kubernetes
  kubernetes:
    role: vso
    serviceAccount: default
---
# Grafana admin password from OpenBao KV at secret/grafana.
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultStaticSecret
metadata:
  name: grafana-admin
  namespace: monitoring
spec:
  vaultAuthRef: vso-auth
  mount: secret
  type: kv-v2
  path: grafana
  # Re-sync every 30s so a rotated password propagates quickly.
  refreshAfter: 30s
  destination:
    name: grafana-admin
    create: true
    overwrite: true
  transformation:
    # Only the templated keys below end up in the Secret (raw KV data excluded).
    excludeRaw: true
    templates:
      admin-password:
        text: "{{ index .Secrets \"admin-password\" }}"
      admin-user:
        text: "admin"