feat(infra): production bootstrap — cert-manager, longhorn, monitoring

Add new bases for cert-manager (Let's Encrypt + multi-SAN cert), Longhorn
distributed storage, and monitoring (kube-prometheus-stack + Loki + Tempo
+ Grafana OIDC). Add cloud-init for Scaleway Elastic Metal provisioning.

Production overlay: add patches for postgres sizing, SeaweedFS volume,
OpenSearch storage, LiveKit service, Pingora host ports, resource limits,
and CNPG daily barman backups. Update cert-manager.yaml with full dnsNames
for all *.sunbeam.pt subdomains.
2026-03-06 12:06:27 +00:00
parent f7774558e9
commit 7ff35d3e0c
23 changed files with 855 additions and 35 deletions

base/cert-manager/kustomization.yaml

@@ -0,0 +1,15 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
helmCharts:
  # helm repo add jetstack https://charts.jetstack.io
  - name: cert-manager
    repo: https://charts.jetstack.io
    version: "1.19.4"
    releaseName: cert-manager
    namespace: cert-manager
    valuesFile: values.yaml
    includeCRDs: true

base/cert-manager/namespace.yaml

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager

base/cert-manager/values.yaml

@@ -0,0 +1,2 @@
crds:
  enabled: true

base/longhorn/kustomization.yaml

@@ -0,0 +1,13 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
helmCharts:
  - name: longhorn
    repo: https://charts.longhorn.io
    version: "1.11.0"
    releaseName: longhorn
    namespace: longhorn-system
    valuesFile: values.yaml

base/longhorn/namespace.yaml

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: longhorn-system

base/longhorn/values.yaml

@@ -0,0 +1,24 @@
# Longhorn distributed block storage
# Single-node production deployment — RAID1 hardware already provides redundancy.
defaultSettings:
  # 1 replica: RAID1 mirrors the disk, so software replication adds no benefit
  # on a single node and would halve available capacity.
  defaultReplicaCount: 1
  # Orphan auto-deletion: automatically clean up replica data orphaned by
  # node or instance failures.
  orphanAutoDeletion: true
  # Allow volumes to be scheduled on the only available node even when disk
  # pressure is detected (single-node: no other node to reschedule to).
  allowVolumeCreationWithDegradedAvailability: true
  # Reduce reserved percentage to 10% — RAID1 hardware provides physical redundancy,
  # so Longhorn doesn't need to hold back 30% for software replicas.
  # With a 937 GiB disk: ~843 GiB schedulable (600 SeaweedFS + 100 Postgres + ~143 headroom).
  storageReservedPercentageForDefaultDisk: 10
# Set Longhorn as the default StorageClass.
persistence:
  defaultClass: true
  defaultClassReplicaCount: 1
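
Once the chart settles, the defaults can be confirmed from the CLI — a quick
check, assuming kubectl access to the cluster (kubectl marks the default
class itself):

  # Longhorn should register itself as the default StorageClass.
  kubectl get storageclass
  # expect: longhorn (default)   driver.longhorn.io   ...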

base/monitoring/grafana-oauth2client.yaml

@@ -0,0 +1,32 @@
# Hydra OAuth2Client for Grafana OIDC sign-in.
#
# Hydra Maester watches this CRD and:
# 1. Registers the client with Hydra
# 2. Creates K8s Secret "grafana-oidc" in monitoring namespace
# with CLIENT_ID and CLIENT_SECRET keys.
#
# Grafana picks up the secret via envFromSecret and interpolates
# ${CLIENT_ID} / ${CLIENT_SECRET} in grafana.ini at startup.
#
# DOMAIN_SUFFIX is substituted by sunbeam apply.
---
apiVersion: hydra.ory.sh/v1alpha1
kind: OAuth2Client
metadata:
name: grafana
namespace: monitoring
spec:
clientName: Grafana
grantTypes:
- authorization_code
- refresh_token
responseTypes:
- code
scope: openid email profile
redirectUris:
- https://grafana.DOMAIN_SUFFIX/login/generic_oauth
postLogoutRedirectUris:
- https://grafana.DOMAIN_SUFFIX/
tokenEndpointAuthMethod: client_secret_post
secretName: grafana-oidc
skipConsent: true
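
For reference, the Secret Hydra Maester generates from this client looks
roughly like the sketch below — the key names come from the comment above;
the values are placeholders created at registration time:

  apiVersion: v1
  kind: Secret
  metadata:
    name: grafana-oidc
    namespace: monitoring
  stringData:
    CLIENT_ID: <generated-by-hydra>      # placeholder
    CLIENT_SECRET: <generated-by-hydra>  # placeholder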

base/monitoring/kustomization.yaml

@@ -0,0 +1,34 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  - namespace.yaml
  - vault-secrets.yaml
  - grafana-oauth2client.yaml
helmCharts:
  # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  - name: kube-prometheus-stack
    repo: https://prometheus-community.github.io/helm-charts
    version: "82.9.0"
    releaseName: kube-prometheus-stack
    namespace: monitoring
    valuesFile: prometheus-values.yaml
    includeCRDs: true
  # helm repo add grafana https://grafana.github.io/helm-charts
  - name: loki
    repo: https://grafana.github.io/helm-charts
    version: "6.53.0"
    releaseName: loki
    namespace: monitoring
    valuesFile: loki-values.yaml
  - name: tempo
    repo: https://grafana.github.io/helm-charts
    version: "1.24.4"
    releaseName: tempo
    namespace: monitoring
    valuesFile: tempo-values.yaml

base/monitoring/loki-values.yaml

@@ -0,0 +1,43 @@
# Loki — monolithic single-binary mode, filesystem storage, single tenant.
deploymentMode: SingleBinary
loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  storage:
    type: filesystem
  schemaConfig:
    configs:
      - from: "2024-01-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: index_
          period: 24h
singleBinary:
  replicas: 1
  persistence:
    enabled: true
    size: 30Gi
# Disable sub-charts/probes not needed for single-node
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
monitoring:
  selfMonitoring:
    enabled: false
    grafanaAgent:
      installOperator: false
lokiCanary:
  enabled: false
test:
  enabled: false

base/monitoring/namespace.yaml

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

base/monitoring/prometheus-values.yaml

@@ -0,0 +1,96 @@
# kube-prometheus-stack — Prometheus + AlertManager + Grafana + node-exporter + kube-state-metrics
#
# k3s quirks: kube-proxy is replaced by Cilium; etcd/scheduler/controller-manager
# don't expose metrics on standard ports. Disable their monitors to avoid noise.
grafana:
  adminUser: admin
  admin:
    existingSecret: grafana-admin
    passwordKey: admin-password
  persistence:
    enabled: true
    size: 2Gi
  # Inject Hydra OIDC client credentials (created by Hydra Maester from the OAuth2Client CRD)
  envFromSecret: grafana-oidc
  grafana.ini:
    server:
      root_url: "https://grafana.DOMAIN_SUFFIX"
    auth:
      # Keep local login as a fallback (admin password from the grafana-admin secret)
      disable_login_form: false
      signout_redirect_url: "https://auth.DOMAIN_SUFFIX/oauth2/sessions/logout"
    auth.generic_oauth:
      enabled: true
      name: Sunbeam
      icon: signin
      # CLIENT_ID / CLIENT_SECRET injected from the grafana-oidc K8s Secret via envFromSecret
      client_id: "${CLIENT_ID}"
      client_secret: "${CLIENT_SECRET}"
      scopes: "openid email profile"
      auth_url: "https://auth.DOMAIN_SUFFIX/oauth2/auth"
      token_url: "https://auth.DOMAIN_SUFFIX/oauth2/token"
      api_url: "https://auth.DOMAIN_SUFFIX/userinfo"
      allow_sign_up: true
      # Small studio — anyone with a valid La Suite account is an admin.
      # To restrict to specific users, set role_attribute_path instead.
      auto_assign_org_role: Admin
      skip_org_role_sync: true
  additionalDataSources:
    - name: Loki
      type: loki
      url: http://loki.monitoring.svc.cluster.local:3100
      access: proxy
      isDefault: false
    - name: Tempo
      type: tempo
      url: http://tempo.monitoring.svc.cluster.local:3100
      access: proxy
      isDefault: false
prometheus:
  prometheusSpec:
    retention: 90d
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: [ReadWriteOnce]
          resources:
            requests:
              storage: 30Gi
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          accessModes: [ReadWriteOnce]
          resources:
            requests:
              storage: 2Gi
  config:
    global:
      smtp_from: "alerts@DOMAIN_SUFFIX"
      smtp_smarthost: "postfix.lasuite.svc.cluster.local:25"
      smtp_require_tls: false
    route:
      group_by: [alertname, namespace]
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 12h
      receiver: email
    receivers:
      - name: email
        email_configs:
          - to: "ops@DOMAIN_SUFFIX"
            send_resolved: true
# Disable monitors for components k3s doesn't expose
kubeEtcd:
  enabled: false
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false
kubeProxy:
  enabled: false

base/monitoring/tempo-values.yaml

@@ -0,0 +1,26 @@
# Tempo — monolithic single-binary, local filesystem backend.
# Receives OTLP over gRPC (:4317) and HTTP (:4318).
tempo:
  reportingEnabled: false
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
  storage:
    trace:
      backend: local
      local:
        path: /var/tempo/traces
      wal:
        path: /var/tempo/wal
persistence:
  enabled: true
  size: 20Gi
# Expose OTLP ports as a ClusterIP service
service:
  type: ClusterIP
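
Workloads can then export traces by pointing the standard OpenTelemetry
environment variables at the in-cluster service — a sketch; the Service
name assumes the chart's default:

  env:
    - name: OTEL_EXPORTER_OTLP_ENDPOINT
      value: "http://tempo.monitoring.svc.cluster.local:4318"
    - name: OTEL_EXPORTER_OTLP_PROTOCOL
      value: "http/protobuf"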

base/monitoring/vault-secrets.yaml

@@ -0,0 +1,36 @@
---
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultAuth
metadata:
  name: vso-auth
  namespace: monitoring
spec:
  method: kubernetes
  mount: kubernetes
  kubernetes:
    role: vso
    serviceAccount: default
---
# Grafana admin password from OpenBao KV at secret/grafana.
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultStaticSecret
metadata:
  name: grafana-admin
  namespace: monitoring
spec:
  vaultAuthRef: vso-auth
  mount: secret
  type: kv-v2
  path: grafana
  refreshAfter: 30s
  destination:
    name: grafana-admin
    create: true
    overwrite: true
    transformation:
      excludeRaw: true
      templates:
        admin-password:
          text: '{{ index .Secrets "admin-password" }}'
        admin-user:
          text: "admin"

cloud-init.yaml

@@ -0,0 +1,47 @@
#cloud-config
# Scaleway Elastic Metal — latest Debian
# Provisions: sienna user w/ GitHub SSH keys, k3s (traefik disabled)
users:
  - name: sienna
    groups: [sudo]
    shell: /bin/bash
    sudo: "ALL=(ALL) NOPASSWD:ALL"
    ssh_import_id:
      - gh:siennathesane
# Disable root login — access goes through the sienna user's SSH keys only
disable_root: true
package_update: true
package_upgrade: true
packages:
  - curl
  - ca-certificates
  - jq
# Write k3s config before the installer runs so traefik is never started
write_files:
  - path: /etc/rancher/k3s/config.yaml
    owner: root:root
    permissions: "0644"
    content: |
      disable:
        - traefik
runcmd:
  # Install k3s (picks up /etc/rancher/k3s/config.yaml automatically)
  - curl -sfL https://get.k3s.io | sh -
  # Allow sienna to use kubectl without sudo
  - mkdir -p /home/sienna/.kube
  - cp /etc/rancher/k3s/k3s.yaml /home/sienna/.kube/config
  - chown -R sienna:sienna /home/sienna/.kube
  - chmod 600 /home/sienna/.kube/config
  # The installer enables the k3s unit; be explicit that it's enabled and running
  - systemctl enable --now k3s
final_message: |
  Sunbeam node ready. k3s installed, traefik disabled.
  SSH: ssh sienna@<server-ip>
  kubectl: KUBECONFIG=~/.kube/config kubectl get nodes
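
After first boot, a quick sanity check that provisioning worked (run as
sienna on the server; the traefik lookup should fail, since it was never
deployed):

  kubectl get nodes
  kubectl get deploy -n kube-system traefik   # expect: NotFound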

overlays/production/cert-manager.yaml

@@ -1,18 +1,30 @@
-# cert-manager resources for production TLS.
+# cert-manager issuers and certificate for production TLS.
 #
-# Prerequisites:
-# cert-manager must be installed in the cluster before applying this overlay:
-#   kubectl apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.yaml
+# WORKFLOW: start with letsencrypt-staging to verify the HTTP-01 challenge
+# flow works without burning production rate limits. Once the staging cert
+# is issued successfully, flip the Certificate issuerRef to letsencrypt-production
+# and delete the old Secret so cert-manager re-issues with a trusted cert.
 #
-# DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed at deploy time.
-# See overlays/production/kustomization.yaml for the deploy command.
+# ACME_EMAIL is substituted by sunbeam apply.
 ---
-# ClusterIssuer: Let's Encrypt production via HTTP-01 challenge.
-#
-# cert-manager creates one Ingress per challenged domain. The pingora proxy
-# watches these Ingresses and routes /.well-known/acme-challenge/<token>
-# requests to the per-domain solver Service, so multi-SAN certificates are
-# issued correctly even when all domain challenges run in parallel.
+# Let's Encrypt staging — untrusted cert but no rate limits. Use for initial setup.
 apiVersion: cert-manager.io/v1
 kind: ClusterIssuer
 metadata:
+  name: letsencrypt-staging
+spec:
+  acme:
+    server: https://acme-staging-v02.api.letsencrypt.org/directory
+    email: ACME_EMAIL
+    privateKeySecretRef:
+      name: letsencrypt-staging-account-key
+    solvers:
+      - http01:
+          ingress:
+            serviceType: ClusterIP
+---
+# Let's Encrypt production — trusted cert, strict rate limits.
+# Switch to this once staging confirms challenges resolve correctly.
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
@@ -26,16 +38,11 @@ spec:
     solvers:
       - http01:
           ingress:
-            # ingressClassName is intentionally blank: cert-manager still creates
-            # the Ingress object (which the proxy watches), but no ingress
-            # controller needs to act on it — the proxy handles routing itself.
-            ingressClassName: ""
+            serviceType: ClusterIP
 ---
-# Certificate: single multi-SAN cert covering all proxy subdomains.
-# cert-manager issues it via HTTP-01, stores it in pingora-tls Secret, and
-# renews it automatically ~30 days before expiry. The watcher in sunbeam-proxy
-# detects the Secret update and triggers a graceful upgrade so the new cert is
-# loaded without dropping any connections.
+# Certificate covering all proxy subdomains.
+# Start with letsencrypt-staging. Once verified, change issuerRef.name to
+# letsencrypt-production and delete the pingora-tls Secret to force re-issue.
 apiVersion: cert-manager.io/v1
 kind: Certificate
 metadata:
@@ -56,3 +63,6 @@
     - src.DOMAIN_SUFFIX
     - auth.DOMAIN_SUFFIX
     - s3.DOMAIN_SUFFIX
+    - grafana.DOMAIN_SUFFIX
+    - admin.DOMAIN_SUFFIX
+    - integration.DOMAIN_SUFFIX
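
The staging-to-production switchover described above amounts to two
commands — a sketch; the Certificate name is assumed to match its
pingora-tls secretName, and the namespace is assumed to be ingress:

  # Point the Certificate at the production issuer, then force re-issue.
  kubectl patch certificate pingora-tls -n ingress --type merge \
    -p '{"spec":{"issuerRef":{"name":"letsencrypt-production"}}}'
  kubectl delete secret pingora-tls -n ingress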

overlays/production/kustomization.yaml

@@ -3,14 +3,12 @@ kind: Kustomization
 # Production overlay — targets Scaleway Elastic Metal (Paris)
 #
-# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed):
-#   DOMAIN="yourdomain.com" EMAIL="ops@yourdomain.com"
-#   kustomize build overlays/production/ \
-#     | sed -e "s/DOMAIN_SUFFIX/${DOMAIN}/g" -e "s/ACME_EMAIL/${EMAIL}/g" \
-#     | kubectl apply --server-side --force-conflicts -f -
+# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sunbeam apply):
+#   sunbeam apply --env production --domain yourdomain.com
 resources:
   - ../../base/mesh
+  - ../../base/longhorn
+  - ../../base/cert-manager
   - ../../base/ingress
   - ../../base/ory
   - ../../base/data
@@ -18,20 +16,42 @@ resources:
   - ../../base/lasuite
   - ../../base/media
   - ../../base/devtools
   - ../../base/vso
+  - ../../base/monitoring
   # cert-manager ClusterIssuer + Certificate (requires cert-manager to be installed)
   - cert-manager.yaml
+  # CNPG daily backup schedule
+  - postgres-scheduled-backup.yaml
 images:
   # Set to your container registry. DOMAIN_SUFFIX is substituted by sed.
   - name: sunbeam-proxy
     newName: src.DOMAIN_SUFFIX/sunbeam/sunbeam-proxy
+  # La Gaufre integration service — built and pushed by `sunbeam build integration`
+  - name: integration
+    newName: src.DOMAIN_SUFFIX/studio/integration
+    newTag: latest
+  # Meet — built from source and pushed to the Gitea registry.
+  - name: meet-backend
+    newName: src.DOMAIN_SUFFIX/studio/meet-backend
+    newTag: latest
+  - name: meet-frontend
+    newName: src.DOMAIN_SUFFIX/studio/meet-frontend
+    newTag: latest
 patches:
-  - path: values-pingora.yaml
+  # Pingora host ports — bind :80/:443 to the host network
+  - path: patch-pingora-hostport.yaml
   # TODO: set OIDC redirect URIs to https://*.sunbeam.pt/...
   # - path: values-ory.yaml
-  # TODO: set production resource limits (64 GB server)
-  # - path: values-resources.yaml
+  # Production resource limits for the 64 GiB server
+  - path: values-resources.yaml
+  # LiveKit TURN service: ClusterIP (Pingora routes TURN traffic on :443)
+  - path: patch-livekit-service.yaml
+  # CNPG: production sizing (100 Gi storage, 8 Gi RAM) + barman S3 backup config
+  - path: patch-postgres-production.yaml
+  # OpenSearch: expand PVC to 50 Gi
+  - path: patch-opensearch-storage.yaml
+  # SeaweedFS volume: expand PVC to 600 Gi
+  - path: patch-seaweedfs-volume-size.yaml
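
For reference, the deleted comment above shows roughly what sunbeam apply
automates — a sketch; sunbeam's actual flags and internals are assumptions
beyond what the removed lines state:

  DOMAIN="yourdomain.com" EMAIL="ops@yourdomain.com"
  kustomize build overlays/production/ \
    | sed -e "s/DOMAIN_SUFFIX/${DOMAIN}/g" -e "s/ACME_EMAIL/${EMAIL}/g" \
    | kubectl apply --server-side --force-conflicts -f -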

overlays/production/patch-livekit-service.yaml

@@ -0,0 +1,10 @@
# Patch: keep the LiveKit TURN service as ClusterIP — Pingora routes external
# TURN traffic on :443. Without this patch, the chart's default LoadBalancer
# type would sit unserviced (klipper-lb is disabled) and contend with
# Pingora's host-port binding on 443.
apiVersion: v1
kind: Service
metadata:
  name: livekit-server-turn
  namespace: media
spec:
  type: ClusterIP

overlays/production/patch-opensearch-storage.yaml

@@ -0,0 +1,10 @@
# Expand OpenSearch PVC to 50 Gi in production.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: opensearch-data
namespace: data
spec:
resources:
requests:
storage: 50Gi

overlays/production/patch-pingora-hostport.yaml

@@ -0,0 +1,25 @@
# Bind Pingora container ports to the host network so external traffic
# on ports 80 and 443 reaches the proxy pod directly.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: pingora
  namespace: ingress
spec:
  template:
    spec:
      containers:
        - name: pingora
          ports:
            - name: http
              containerPort: 80
              hostPort: 80
              protocol: TCP
            - name: https
              containerPort: 443
              hostPort: 443
              protocol: TCP
            - name: ssh
              containerPort: 22
              hostPort: 22
              protocol: TCP
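
Once the pod is scheduled, the binding can be checked from outside the
cluster — a sketch with a placeholder host:

  # Expect an HTTP status code rather than a connection refusal.
  curl -sk -o /dev/null -w '%{http_code}\n' https://<server-ip>/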

overlays/production/patch-postgres-production.yaml

@@ -0,0 +1,45 @@
# Production CNPG cluster sizing for 12-core, 64 GiB Elastic Metal.
# Barman backs up WAL + base backups to Scaleway Object Storage (s3://sunbeam-backups/postgres).
# barman-s3-creds K8s Secret is synced by VSO from secret/scaleway-s3 in OpenBao.
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: postgres
  namespace: data
spec:
  instances: 1
  postgresql:
    parameters:
      max_connections: "200"
      shared_buffers: "2GB"
      effective_cache_size: "6GB"
      work_mem: "16MB"
      maintenance_work_mem: "512MB"
  storage:
    size: 100Gi
  resources:
    requests:
      memory: 4Gi
      cpu: "2"
    limits:
      memory: 8Gi
  backup:
    barmanObjectStore:
      destinationPath: "s3://sunbeam-backups/postgres"
      endpointURL: "https://s3.fr-par.scw.cloud"
      s3Credentials:
        accessKeyId:
          name: barman-s3-creds
          key: ACCESS_KEY_ID
        secretAccessKey:
          name: barman-s3-creds
          key: ACCESS_SECRET_KEY
      wal:
        compression: gzip
      data:
        compression: gzip
    retentionPolicy: "30d"

overlays/production/patch-seaweedfs-volume-size.yaml

@@ -0,0 +1,15 @@
# Expand SeaweedFS volume PVC to 600 Gi in production.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: seaweedfs-volume
  namespace: storage
spec:
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: [ReadWriteOnce]
        resources:
          requests:
            storage: 600Gi

overlays/production/postgres-scheduled-backup.yaml

@@ -0,0 +1,12 @@
apiVersion: postgresql.cnpg.io/v1
kind: ScheduledBackup
metadata:
  name: postgres-daily
  namespace: data
spec:
  # Daily at 02:00 UTC — CNPG cron has six fields, the first being seconds
  schedule: "0 0 2 * * *"
  backupOwnerReference: self
  cluster:
    name: postgres
  method: barmanObjectStore
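
Before waiting for the schedule to fire, the object-storage wiring can be
smoke-tested with a one-off Backup — a minimal sketch using the same
cluster reference (the name is illustrative):

  apiVersion: postgresql.cnpg.io/v1
  kind: Backup
  metadata:
    name: postgres-manual-test
    namespace: data
  spec:
    cluster:
      name: postgres
    method: barmanObjectStore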

overlays/production/values-resources.yaml

@@ -0,0 +1,293 @@
# Production resource limits — Scaleway Elastic Metal, 12 cores, 64 GiB RAM.
# ~10 GiB reserved for OS + k3s + Linkerd mesh overhead.
# Replicas scaled up for production workloads.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: meet-celery-worker
  namespace: lasuite
spec:
  template:
    spec:
      containers:
        - name: meet-celery-worker
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cloudnative-pg
  namespace: data
spec:
  template:
    spec:
      containers:
        - name: manager
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 512Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: livekit-server
  namespace: media
spec:
  template:
    spec:
      containers:
        - name: livekit-server
          resources:
            requests:
              memory: 512Mi
              cpu: 500m
            limits:
              memory: 2Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: pingora
  namespace: ingress
spec:
  template:
    spec:
      containers:
        - name: pingora
          resources:
            requests:
              memory: 128Mi
              cpu: 250m
            limits:
              memory: 512Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: valkey
  namespace: data
spec:
  template:
    spec:
      containers:
        - name: valkey
          resources:
            requests:
              memory: 128Mi
              cpu: 50m
            limits:
              memory: 512Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: opensearch
  namespace: data
spec:
  template:
    spec:
      containers:
        - name: opensearch
          env:
            - name: OPENSEARCH_JAVA_OPTS
              value: "-Xms2g -Xmx4g"
          resources:
            requests:
              memory: 2Gi
              cpu: 500m
            limits:
              memory: 5Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: seaweedfs-filer
  namespace: storage
spec:
  template:
    spec:
      containers:
        - name: filer
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hydra-hydra-maester
  namespace: ory
spec:
  template:
    spec:
      containers:
        - name: hydra-maester
          resources:
            requests:
              memory: 32Mi
              cpu: 25m
            limits:
              memory: 128Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: login-ui
  namespace: ory
spec:
  template:
    spec:
      containers:
        - name: login-ui
          resources:
            requests:
              memory: 128Mi
              cpu: 50m
            limits:
              memory: 384Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hive
  namespace: lasuite
spec:
  template:
    spec:
      containers:
        - name: hive
          resources:
            requests:
              memory: 64Mi
            limits:
              memory: 256Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: people-backend
  namespace: lasuite
spec:
  replicas: 2
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: people-celery-worker
  namespace: lasuite
spec:
  replicas: 2
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: people-frontend
  namespace: lasuite
spec:
  replicas: 2
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-celery-worker
  namespace: lasuite
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: docs
          env:
            - name: CELERY_WORKER_CONCURRENCY
              value: "4"
          resources:
            requests:
              memory: 512Mi
              cpu: 250m
            limits:
              memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-backend
  namespace: lasuite
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: docs
          env:
            - name: WEB_CONCURRENCY
              value: "4"
          resources:
            requests:
              memory: 512Mi
              cpu: 250m
            limits:
              memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-frontend
  namespace: lasuite
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: docs
          resources:
            requests:
              memory: 64Mi
            limits:
              memory: 256Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-y-provider
  namespace: lasuite
spec:
  replicas: 1
  template:
    spec:
      containers:
        - name: docs
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 1Gi