feat(infra): production bootstrap — cert-manager, longhorn, monitoring

Add new bases for cert-manager (Let's Encrypt + wildcard cert), Longhorn distributed storage, and monitoring (kube-prometheus-stack + Loki + Tempo + Grafana OIDC). Add cloud-init for Scaleway Elastic Metal provisioning. Production overlay: add patches for postgres sizing, SeaweedFS volume, OpenSearch storage, LiveKit service, Pingora host ports, resource limits, and CNPG daily barman backups. Update cert-manager.yaml with full dnsNames for all *.sunbeam.pt subdomains.
2026-03-06 12:06:27 +00:00
parent f7774558e9
commit 7ff35d3e0c
23 changed files with 855 additions and 35 deletions
--- a/overlays/production/cert-manager.yaml
+++ b/overlays/production/cert-manager.yaml
@@ -1,18 +1,30 @@
-# cert-manager resources for production TLS.
+# cert-manager issuers and certificate for production TLS.
 #
-# Prerequisites:
-#   cert-manager must be installed in the cluster before applying this overlay:
-#   kubectl apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.yaml
+# WORKFLOW: start with letsencrypt-staging to verify the HTTP-01 challenge
+# flow works without burning production rate limits. Once the staging cert
+# is issued successfully, flip the Certificate issuerRef to letsencrypt-production
+# and delete the old Secret so cert-manager re-issues with a trusted cert.
 #
-# DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed at deploy time.
-# See overlays/production/kustomization.yaml for the deploy command.
+# ACME_EMAIL is substituted by sunbeam apply.
 ---
-# ClusterIssuer: Let's Encrypt production via HTTP-01 challenge.
-#
-# cert-manager creates one Ingress per challenged domain.  The pingora proxy
-# watches these Ingresses and routes /.well-known/acme-challenge/<token>
-# requests to the per-domain solver Service, so multi-SAN certificates are
-# issued correctly even when all domain challenges run in parallel.
+# Let's Encrypt staging — untrusted cert but no rate limits. Use for initial setup.
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+  name: letsencrypt-staging
+spec:
+  acme:
+    server: https://acme-staging-v02.api.letsencrypt.org/directory
+    email: ACME_EMAIL
+    privateKeySecretRef:
+      name: letsencrypt-staging-account-key
+    solvers:
+      - http01:
+          ingress:
+            serviceType: ClusterIP
+---
+# Let's Encrypt production — trusted cert, strict rate limits.
+# Switch to this once staging confirms challenges resolve correctly.
 apiVersion: cert-manager.io/v1
 kind: ClusterIssuer
 metadata:
@@ -26,16 +38,11 @@ spec:
    solvers:
      - http01:
          ingress:
-            # ingressClassName is intentionally blank: cert-manager still creates
-            # the Ingress object (which the proxy watches), but no ingress
-            # controller needs to act on it — the proxy handles routing itself.
-            ingressClassName: ""
+            serviceType: ClusterIP
 ---
-# Certificate: single multi-SAN cert covering all proxy subdomains.
-# cert-manager issues it via HTTP-01, stores it in pingora-tls Secret, and
-# renews it automatically ~30 days before expiry.  The watcher in sunbeam-proxy
-# detects the Secret update and triggers a graceful upgrade so the new cert is
-# loaded without dropping any connections.
+# Certificate covering all proxy subdomains.
+# Start with letsencrypt-staging. Once verified, change issuerRef.name to
+# letsencrypt-production and delete the pingora-tls Secret to force re-issue.
 apiVersion: cert-manager.io/v1
 kind: Certificate
 metadata:
@@ -56,3 +63,6 @@ spec:
    - src.DOMAIN_SUFFIX
    - auth.DOMAIN_SUFFIX
    - s3.DOMAIN_SUFFIX
+    - grafana.DOMAIN_SUFFIX
+    - admin.DOMAIN_SUFFIX
+    - integration.DOMAIN_SUFFIX
--- a/overlays/production/kustomization.yaml
+++ b/overlays/production/kustomization.yaml
@@ -3,14 +3,12 @@ kind: Kustomization

 # Production overlay — targets Scaleway Elastic Metal (Paris)
 #
-# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed):
-#   DOMAIN="yourdomain.com" EMAIL="ops@yourdomain.com"
-#   kustomize build overlays/production/ \
-#     | sed -e "s/DOMAIN_SUFFIX/${DOMAIN}/g" -e "s/ACME_EMAIL/${EMAIL}/g" \
-#     | kubectl apply --server-side --force-conflicts -f -
+# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sunbeam apply):
+#   sunbeam apply --env production --domain yourdomain.com

 resources:
-  - ../../base/mesh
+  - ../../base/longhorn
+  - ../../base/cert-manager
  - ../../base/ingress
  - ../../base/ory
  - ../../base/data
@@ -18,20 +16,42 @@ resources:
  - ../../base/lasuite
  - ../../base/media
  - ../../base/devtools
+  - ../../base/vso
+  - ../../base/monitoring
  # cert-manager ClusterIssuer + Certificate (requires cert-manager to be installed)
  - cert-manager.yaml
+  # CNPG daily backup schedule
+  - postgres-scheduled-backup.yaml

 images:
-  # Set to your container registry. DOMAIN_SUFFIX is substituted by sed.
-  - name: sunbeam-proxy
-    newName: src.DOMAIN_SUFFIX/sunbeam/sunbeam-proxy
+  # La Gaufre integration service — built and pushed by `sunbeam build integration`
+  - name: integration
+    newName: src.DOMAIN_SUFFIX/studio/integration
+    newTag: latest
+
+  # Meet — built from source and pushed to Gitea registry.
+  - name: meet-backend
+    newName: src.DOMAIN_SUFFIX/studio/meet-backend
+    newTag: latest
+  - name: meet-frontend
+    newName: src.DOMAIN_SUFFIX/studio/meet-frontend
    newTag: latest

 patches:
-  - path: values-pingora.yaml
+  # Pingora host ports — expose :80/:443/:22 on the node via hostPort
+  - path: patch-pingora-hostport.yaml

-  # TODO: set OIDC redirect URIs to https://*.sunbeam.pt/...
-  # - path: values-ory.yaml
+  # Production resource limits for 64 GiB server
+  - path: values-resources.yaml

-  # TODO: set production resource limits (64 GB server)
-  # - path: values-resources.yaml
+  # LiveKit TURN service: ClusterIP (Pingora routes TURN traffic on :443)
+  - path: patch-livekit-service.yaml
+
+  # CNPG: production sizing (100 Gi storage, 8 Gi memory limit) + barman S3 backup config
+  - path: patch-postgres-production.yaml
+
+  # OpenSearch: expand PVC to 50 Gi
+  - path: patch-opensearch-storage.yaml
+
+  # SeaweedFS volume: expand PVC to 600 Gi
+  - path: patch-seaweedfs-volume-size.yaml
--- a/overlays/production/patch-livekit-service.yaml
+++ b/overlays/production/patch-livekit-service.yaml
@@ -0,0 +1,10 @@
+# Patch: keep LiveKit TURN service as ClusterIP — Pingora routes external TURN traffic.
+# Without this patch, klipper-lb (disabled) or the default LoadBalancer type may
+# conflict with Pingora's host port bindings on port 443.
+apiVersion: v1
+kind: Service
+metadata:
+  name: livekit-server-turn
+  namespace: media
+spec:
+  type: ClusterIP
--- a/overlays/production/patch-opensearch-storage.yaml
+++ b/overlays/production/patch-opensearch-storage.yaml
@@ -0,0 +1,10 @@
+# Production patch: the opensearch-data PVC (namespace data) requests 50 Gi.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: opensearch-data
+  namespace: data
+spec:
+  resources:
+    requests:
+      storage: 50Gi  # NOTE(review): growing a bound PVC needs allowVolumeExpansion on its StorageClass — confirm
--- a/overlays/production/patch-pingora-hostport.yaml
+++ b/overlays/production/patch-pingora-hostport.yaml
@@ -0,0 +1,25 @@
+# Map Pingora's container ports to hostPorts on the node so external traffic
+# on ports 80, 443 and 22 (SSH) reaches the proxy pod directly.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: pingora
+  namespace: ingress
+spec:
+  template:
+    spec:
+      containers:
+        - name: pingora
+          ports:
+            - name: http
+              containerPort: 80
+              hostPort: 80
+              protocol: TCP
+            - name: https
+              containerPort: 443
+              hostPort: 443
+              protocol: TCP
+            - name: ssh
+              containerPort: 22
+              hostPort: 22  # NOTE(review): may shadow the node's own sshd on :22 — confirm it listens elsewhere
+              protocol: TCP
--- a/overlays/production/patch-postgres-production.yaml
+++ b/overlays/production/patch-postgres-production.yaml
@@ -0,0 +1,45 @@
+# Production CNPG cluster sizing for 12-core, 64 GiB Elastic Metal.
+# Barman backs up WAL + base backups to Scaleway Object Storage (s3://sunbeam-backups/postgres).
+# barman-s3-creds K8s Secret is synced by VSO from secret/scaleway-s3 in OpenBao.
+apiVersion: postgresql.cnpg.io/v1
+kind: Cluster
+metadata:
+  name: postgres
+  namespace: data
+spec:
+  instances: 1  # single instance: no standby replica is requested
+
+  postgresql:
+    parameters:
+      max_connections: "200"
+      shared_buffers: "2GB"
+      effective_cache_size: "6GB"
+      work_mem: "16MB"
+      maintenance_work_mem: "512MB"
+
+  storage:
+    size: 100Gi  # data volume size requested for the instance
+
+  resources:
+    requests:
+      memory: 4Gi
+      cpu: "2"
+    limits:
+      memory: 8Gi  # memory is capped; no CPU limit is set
+
+  backup:
+    barmanObjectStore:
+      destinationPath: "s3://sunbeam-backups/postgres"
+      endpointURL: "https://s3.fr-par.scw.cloud"
+      s3Credentials:
+        # Both keys are read from the barman-s3-creds Secret.
+        accessKeyId:
+          name: barman-s3-creds
+          key: ACCESS_KEY_ID
+        secretAccessKey:
+          name: barman-s3-creds
+          key: ACCESS_SECRET_KEY
+      wal:
+        compression: gzip
+      data:
+        compression: gzip
+    retentionPolicy: "30d"  # retention window for backups: 30 days
--- a/overlays/production/patch-seaweedfs-volume-size.yaml
+++ b/overlays/production/patch-seaweedfs-volume-size.yaml
@@ -0,0 +1,15 @@
+# Production patch: the SeaweedFS volume server's "data" claim requests 600 Gi.
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: seaweedfs-volume
+  namespace: storage
+spec:
+  volumeClaimTemplates:  # NOTE(review): cannot be changed on an existing StatefulSet — confirm this applies at first creation
+    - metadata:
+        name: data
+      spec:
+        accessModes: [ReadWriteOnce]
+        resources:
+          requests:
+            storage: 600Gi
--- a/overlays/production/postgres-scheduled-backup.yaml
+++ b/overlays/production/postgres-scheduled-backup.yaml
@@ -0,0 +1,12 @@
+apiVersion: postgresql.cnpg.io/v1
+kind: ScheduledBackup
+metadata:
+  name: postgres-daily
+  namespace: data
+spec:
+  # Daily at 02:00 UTC. CNPG uses a six-field cron spec (seconds first), not Unix crontab.
+  schedule: "0 0 2 * * *"
+  backupOwnerReference: self
+  cluster:
+    name: postgres
+  method: barmanObjectStore
--- a/overlays/production/values-resources.yaml
+++ b/overlays/production/values-resources.yaml
@@ -0,0 +1,293 @@
+# Production resource limits — Scaleway Elastic Metal, 12 cores, 64 GiB RAM.
+# ~10 GiB reserved for OS + k3s + Linkerd mesh overhead.
+# Replicas scaled up for production workloads.
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: meet-celery-worker
+  namespace: lasuite
+spec:
+  template:
+    spec:
+      containers:
+        - name: meet-celery-worker
+          resources:
+            requests:
+              memory: 256Mi
+              cpu: 100m
+            limits:
+              memory: 1Gi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: cloudnative-pg
+  namespace: data
+spec:
+  template:
+    spec:
+      containers:
+        - name: manager
+          resources:
+            requests:
+              memory: 256Mi
+              cpu: 100m
+            limits:
+              memory: 512Mi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: livekit-server
+  namespace: media
+spec:
+  template:
+    spec:
+      containers:
+        - name: livekit-server
+          resources:
+            requests:
+              memory: 512Mi
+              cpu: 500m
+            limits:
+              memory: 2Gi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: pingora
+  namespace: ingress
+spec:
+  template:
+    spec:
+      containers:
+        - name: pingora
+          resources:
+            requests:
+              memory: 128Mi
+              cpu: 250m
+            limits:
+              memory: 512Mi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: valkey
+  namespace: data
+spec:
+  template:
+    spec:
+      containers:
+        - name: valkey
+          resources:
+            requests:
+              memory: 128Mi
+              cpu: 50m
+            limits:
+              memory: 512Mi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: opensearch
+  namespace: data
+spec:
+  template:
+    spec:
+      containers:
+        - name: opensearch
+          env:
+            - name: OPENSEARCH_JAVA_OPTS  # NOTE(review): OpenSearch docs advise Xms == Xmx — confirm the 2g/4g split is intended
+              value: "-Xms2g -Xmx4g"
+          resources:
+            requests:
+              memory: 2Gi
+              cpu: 500m
+            limits:
+              memory: 5Gi  # about 1 GiB above the 4g max heap
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: seaweedfs-filer
+  namespace: storage
+spec:
+  template:
+    spec:
+      containers:
+        - name: filer
+          resources:
+            requests:
+              memory: 256Mi
+              cpu: 100m
+            limits:
+              memory: 1Gi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: hydra-hydra-maester
+  namespace: ory
+spec:
+  template:
+    spec:
+      containers:
+        - name: hydra-maester
+          resources:
+            requests:
+              memory: 32Mi
+              cpu: 25m
+            limits:
+              memory: 128Mi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: login-ui
+  namespace: ory
+spec:
+  template:
+    spec:
+      containers:
+        - name: login-ui
+          resources:
+            requests:
+              memory: 128Mi
+              cpu: 50m
+            limits:
+              memory: 384Mi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: hive
+  namespace: lasuite
+spec:
+  template:
+    spec:
+      containers:
+        - name: hive
+          resources:
+            requests:
+              memory: 64Mi
+            limits:
+              memory: 256Mi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: people-backend
+  namespace: lasuite
+spec:
+  replicas: 2
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: people-celery-worker
+  namespace: lasuite
+spec:
+  replicas: 2
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: people-frontend
+  namespace: lasuite
+spec:
+  replicas: 2
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: docs-celery-worker
+  namespace: lasuite
+spec:
+  replicas: 2
+  template:
+    spec:
+      containers:
+        - name: docs
+          env:
+            - name: CELERY_WORKER_CONCURRENCY
+              value: "4"
+          resources:
+            requests:
+              memory: 512Mi
+              cpu: 250m
+            limits:
+              memory: 1Gi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: docs-backend
+  namespace: lasuite
+spec:
+  replicas: 2
+  template:
+    spec:
+      containers:
+        - name: docs
+          env:
+            - name: WEB_CONCURRENCY
+              value: "4"
+          resources:
+            requests:
+              memory: 512Mi
+              cpu: 250m
+            limits:
+              memory: 1Gi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: docs-frontend
+  namespace: lasuite
+spec:
+  replicas: 2
+  template:
+    spec:
+      containers:
+        - name: docs
+          resources:
+            requests:
+              memory: 64Mi
+            limits:
+              memory: 256Mi
+
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: docs-y-provider
+  namespace: lasuite
+spec:
+  replicas: 1
+  template:
+    spec:
+      containers:
+        - name: docs
+          resources:
+            requests:
+              memory: 256Mi
+              cpu: 100m
+            limits:
+              memory: 1Gi