feat(infra): production bootstrap — cert-manager, longhorn, monitoring

Add new bases for cert-manager (Let's Encrypt + multi-SAN cert), Longhorn
distributed storage, and monitoring (kube-prometheus-stack + Loki + Tempo
+ Grafana OIDC). Add cloud-init for Scaleway Elastic Metal provisioning.

Production overlay: add patches for postgres sizing, SeaweedFS volume,
OpenSearch storage, LiveKit service, Pingora host ports, resource limits,
and CNPG daily barman backups. Update cert-manager.yaml with full dnsNames
for all *.sunbeam.pt subdomains.
2026-03-06 12:06:27 +00:00
parent f7774558e9
commit 7ff35d3e0c
23 changed files with 855 additions and 35 deletions

base/cert-manager/kustomization.yaml

@@ -0,0 +1,15 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
helmCharts:
  # helm repo add jetstack https://charts.jetstack.io
  - name: cert-manager
    repo: https://charts.jetstack.io
    version: "1.19.4"
    releaseName: cert-manager
    namespace: cert-manager
    valuesFile: values.yaml
    includeCRDs: true

base/cert-manager/namespace.yaml

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager

base/cert-manager/values.yaml

@@ -0,0 +1,2 @@
crds:
  enabled: true

base/longhorn/kustomization.yaml

@@ -0,0 +1,13 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - namespace.yaml
helmCharts:
  - name: longhorn
    repo: https://charts.longhorn.io
    version: "1.11.0"
    releaseName: longhorn
    namespace: longhorn-system
    valuesFile: values.yaml

base/longhorn/namespace.yaml

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: longhorn-system

base/longhorn/values.yaml

@@ -0,0 +1,24 @@
# Longhorn distributed block storage
# Single-node production deployment — RAID1 hardware already provides redundancy.
defaultSettings:
  # 1 replica: RAID1 mirrors the disk, so software replication adds no benefit
  # on a single node and would halve available capacity.
  defaultReplicaCount: 1
  # Orphan auto-deletion: automatically clean up replica data orphaned by
  # node or instance failures.
  orphanAutoDeletion: true
  # Allow volumes to be scheduled on the only available node even when disk
  # pressure is detected (single-node: no other node to reschedule to).
  allowVolumeCreationWithDegradedAvailability: true
  # Reduce reserved percentage to 10% — RAID1 hardware provides physical redundancy,
  # so Longhorn doesn't need to hold back 30% for software replicas.
  # With a 937 GiB disk: ~843 GiB schedulable (600 SeaweedFS + 100 Postgres + ~143 headroom).
  storageReservedPercentageForDefaultDisk: 10
# Set Longhorn as the default StorageClass.
persistence:
  defaultClass: true
  defaultClassReplicaCount: 1
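
Once the chart settles, the defaults can be confirmed from the CLI — a quick
check, assuming kubectl access to the cluster (kubectl marks the default
class itself):

  # Longhorn should register itself as the default StorageClass.
  kubectl get storageclass
  # expect: longhorn (default)   driver.longhorn.io   ...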

base/monitoring/grafana-oauth2client.yaml

@@ -0,0 +1,32 @@
# Hydra OAuth2Client for Grafana OIDC sign-in.
#
# Hydra Maester watches this CRD and:
# 1. Registers the client with Hydra
# 2. Creates K8s Secret "grafana-oidc" in monitoring namespace
# with CLIENT_ID and CLIENT_SECRET keys.
#
# Grafana picks up the secret via envFromSecret and interpolates
# ${CLIENT_ID} / ${CLIENT_SECRET} in grafana.ini at startup.
#
# DOMAIN_SUFFIX is substituted by sunbeam apply.
---
apiVersion: hydra.ory.sh/v1alpha1
kind: OAuth2Client
metadata:
name: grafana
namespace: monitoring
spec:
clientName: Grafana
grantTypes:
- authorization_code
- refresh_token
responseTypes:
- code
scope: openid email profile
redirectUris:
- https://grafana.DOMAIN_SUFFIX/login/generic_oauth
postLogoutRedirectUris:
- https://grafana.DOMAIN_SUFFIX/
tokenEndpointAuthMethod: client_secret_post
secretName: grafana-oidc
skipConsent: true
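
For reference, the Secret Hydra Maester generates from this client looks
roughly like the sketch below — the key names come from the comment above;
the values are placeholders created at registration time:

  apiVersion: v1
  kind: Secret
  metadata:
    name: grafana-oidc
    namespace: monitoring
  stringData:
    CLIENT_ID: <generated-by-hydra>      # placeholder
    CLIENT_SECRET: <generated-by-hydra>  # placeholder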

base/monitoring/kustomization.yaml

@@ -0,0 +1,34 @@
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: monitoring
resources:
  - namespace.yaml
  - vault-secrets.yaml
  - grafana-oauth2client.yaml
helmCharts:
  # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  - name: kube-prometheus-stack
    repo: https://prometheus-community.github.io/helm-charts
    version: "82.9.0"
    releaseName: kube-prometheus-stack
    namespace: monitoring
    valuesFile: prometheus-values.yaml
    includeCRDs: true
  # helm repo add grafana https://grafana.github.io/helm-charts
  - name: loki
    repo: https://grafana.github.io/helm-charts
    version: "6.53.0"
    releaseName: loki
    namespace: monitoring
    valuesFile: loki-values.yaml
  - name: tempo
    repo: https://grafana.github.io/helm-charts
    version: "1.24.4"
    releaseName: tempo
    namespace: monitoring
    valuesFile: tempo-values.yaml

base/monitoring/loki-values.yaml

@@ -0,0 +1,43 @@
# Loki — monolithic single-binary mode, filesystem storage, single tenant.
deploymentMode: SingleBinary
loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  storage:
    type: filesystem
  schemaConfig:
    configs:
      - from: "2024-01-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: index_
          period: 24h
singleBinary:
  replicas: 1
  persistence:
    enabled: true
    size: 30Gi
# Disable sub-charts/probes not needed for single-node
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0
monitoring:
  selfMonitoring:
    enabled: false
    grafanaAgent:
      installOperator: false
lokiCanary:
  enabled: false
test:
  enabled: false

base/monitoring/namespace.yaml

@@ -0,0 +1,4 @@
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring

base/monitoring/prometheus-values.yaml

@@ -0,0 +1,96 @@
# kube-prometheus-stack — Prometheus + AlertManager + Grafana + node-exporter + kube-state-metrics
#
# k3s quirks: kube-proxy is replaced by Cilium; etcd/scheduler/controller-manager
# don't expose metrics on standard ports. Disable their monitors to avoid noise.
grafana:
  adminUser: admin
  admin:
    existingSecret: grafana-admin
    passwordKey: admin-password
  persistence:
    enabled: true
    size: 2Gi
  # Inject Hydra OIDC client credentials (created by Hydra Maester from the OAuth2Client CRD)
  envFromSecret: grafana-oidc
  grafana.ini:
    server:
      root_url: "https://grafana.DOMAIN_SUFFIX"
    auth:
      # Keep local login as a fallback (admin password from the grafana-admin secret)
      disable_login_form: false
      signout_redirect_url: "https://auth.DOMAIN_SUFFIX/oauth2/sessions/logout"
    auth.generic_oauth:
      enabled: true
      name: Sunbeam
      icon: signin
      # CLIENT_ID / CLIENT_SECRET injected from the grafana-oidc K8s Secret via envFromSecret
      client_id: "${CLIENT_ID}"
      client_secret: "${CLIENT_SECRET}"
      scopes: "openid email profile"
      auth_url: "https://auth.DOMAIN_SUFFIX/oauth2/auth"
      token_url: "https://auth.DOMAIN_SUFFIX/oauth2/token"
      api_url: "https://auth.DOMAIN_SUFFIX/userinfo"
      allow_sign_up: true
      # Small studio — anyone with a valid La Suite account is an admin.
      # To restrict to specific users, set role_attribute_path instead.
      auto_assign_org_role: Admin
      skip_org_role_sync: true
  additionalDataSources:
    - name: Loki
      type: loki
      url: http://loki.monitoring.svc.cluster.local:3100
      access: proxy
      isDefault: false
    - name: Tempo
      type: tempo
      url: http://tempo.monitoring.svc.cluster.local:3100
      access: proxy
      isDefault: false
prometheus:
  prometheusSpec:
    retention: 90d
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: [ReadWriteOnce]
          resources:
            requests:
              storage: 30Gi
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          accessModes: [ReadWriteOnce]
          resources:
            requests:
              storage: 2Gi
  config:
    global:
      smtp_from: "alerts@DOMAIN_SUFFIX"
      smtp_smarthost: "postfix.lasuite.svc.cluster.local:25"
      smtp_require_tls: false
    route:
      group_by: [alertname, namespace]
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 12h
      receiver: email
    receivers:
      - name: email
        email_configs:
          - to: "ops@DOMAIN_SUFFIX"
            send_resolved: true
# Disable monitors for components k3s doesn't expose
kubeEtcd:
  enabled: false
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false
kubeProxy:
  enabled: false

base/monitoring/tempo-values.yaml

@@ -0,0 +1,26 @@
# Tempo — monolithic single-binary, local filesystem backend.
# Receives OTLP over gRPC (:4317) and HTTP (:4318).
tempo:
  reportingEnabled: false
  receivers:
    otlp:
      protocols:
        grpc:
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
  storage:
    trace:
      backend: local
      local:
        path: /var/tempo/traces
      wal:
        path: /var/tempo/wal
persistence:
  enabled: true
  size: 20Gi
# Expose OTLP ports as a ClusterIP service
service:
  type: ClusterIP
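
Workloads can then export traces by pointing the standard OpenTelemetry
environment variables at the in-cluster service — a sketch; the Service
name assumes the chart's default:

  env:
    - name: OTEL_EXPORTER_OTLP_ENDPOINT
      value: "http://tempo.monitoring.svc.cluster.local:4318"
    - name: OTEL_EXPORTER_OTLP_PROTOCOL
      value: "http/protobuf"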

base/monitoring/vault-secrets.yaml

@@ -0,0 +1,36 @@
---
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultAuth
metadata:
  name: vso-auth
  namespace: monitoring
spec:
  method: kubernetes
  mount: kubernetes
  kubernetes:
    role: vso
    serviceAccount: default
---
# Grafana admin password from OpenBao KV at secret/grafana.
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultStaticSecret
metadata:
  name: grafana-admin
  namespace: monitoring
spec:
  vaultAuthRef: vso-auth
  mount: secret
  type: kv-v2
  path: grafana
  refreshAfter: 30s
  destination:
    name: grafana-admin
    create: true
    overwrite: true
    transformation:
      excludeRaw: true
      templates:
        admin-password:
          text: '{{ index .Secrets "admin-password" }}'
        admin-user:
          text: "admin"

cloud-init.yaml

@@ -0,0 +1,47 @@
#cloud-config
# Scaleway Elastic Metal — latest Debian
# Provisions: sienna user w/ GitHub SSH keys, k3s (traefik disabled)
users:
  - name: sienna
    groups: [sudo]
    shell: /bin/bash
    sudo: "ALL=(ALL) NOPASSWD:ALL"
    ssh_import_id:
      - gh:siennathesane
# Disable root login — access goes through the sienna user's SSH keys only
disable_root: true
package_update: true
package_upgrade: true
packages:
  - curl
  - ca-certificates
  - jq
# Write k3s config before the installer runs so traefik is never started
write_files:
  - path: /etc/rancher/k3s/config.yaml
    owner: root:root
    permissions: "0644"
    content: |
      disable:
        - traefik
runcmd:
  # Install k3s (picks up /etc/rancher/k3s/config.yaml automatically)
  - curl -sfL https://get.k3s.io | sh -
  # Allow sienna to use kubectl without sudo
  - mkdir -p /home/sienna/.kube
  - cp /etc/rancher/k3s/k3s.yaml /home/sienna/.kube/config
  - chown -R sienna:sienna /home/sienna/.kube
  - chmod 600 /home/sienna/.kube/config
  # The installer enables the k3s unit; be explicit that it's enabled and running
  - systemctl enable --now k3s
final_message: |
  Sunbeam node ready. k3s installed, traefik disabled.
  SSH: ssh sienna@<server-ip>
  kubectl: KUBECONFIG=~/.kube/config kubectl get nodes
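
After first boot, a quick sanity check that provisioning worked (run as
sienna on the server; the traefik lookup should fail, since it was never
deployed):

  kubectl get nodes
  kubectl get deploy -n kube-system traefik   # expect: NotFound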

overlays/production/cert-manager.yaml

@@ -1,18 +1,30 @@
-# cert-manager resources for production TLS.
+# cert-manager issuers and certificate for production TLS.
 #
-# Prerequisites:
-# cert-manager must be installed in the cluster before applying this overlay:
-#   kubectl apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.yaml
+# WORKFLOW: start with letsencrypt-staging to verify the HTTP-01 challenge
+# flow works without burning production rate limits. Once the staging cert
+# is issued successfully, flip the Certificate issuerRef to letsencrypt-production
+# and delete the old Secret so cert-manager re-issues with a trusted cert.
 #
-# DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed at deploy time.
-# See overlays/production/kustomization.yaml for the deploy command.
+# ACME_EMAIL is substituted by sunbeam apply.
 ---
-# ClusterIssuer: Let's Encrypt production via HTTP-01 challenge.
-#
-# cert-manager creates one Ingress per challenged domain. The pingora proxy
-# watches these Ingresses and routes /.well-known/acme-challenge/<token>
-# requests to the per-domain solver Service, so multi-SAN certificates are
-# issued correctly even when all domain challenges run in parallel.
+# Let's Encrypt staging — untrusted cert but no rate limits. Use for initial setup.
 apiVersion: cert-manager.io/v1
 kind: ClusterIssuer
 metadata:
+  name: letsencrypt-staging
+spec:
+  acme:
+    server: https://acme-staging-v02.api.letsencrypt.org/directory
+    email: ACME_EMAIL
+    privateKeySecretRef:
+      name: letsencrypt-staging-account-key
+    solvers:
+      - http01:
+          ingress:
+            serviceType: ClusterIP
+---
+# Let's Encrypt production — trusted cert, strict rate limits.
+# Switch to this once staging confirms challenges resolve correctly.
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
@@ -26,16 +38,11 @@ spec:
     solvers:
       - http01:
           ingress:
-            # ingressClassName is intentionally blank: cert-manager still creates
-            # the Ingress object (which the proxy watches), but no ingress
-            # controller needs to act on it — the proxy handles routing itself.
-            ingressClassName: ""
+            serviceType: ClusterIP
 ---
-# Certificate: single multi-SAN cert covering all proxy subdomains.
-# cert-manager issues it via HTTP-01, stores it in pingora-tls Secret, and
-# renews it automatically ~30 days before expiry. The watcher in sunbeam-proxy
-# detects the Secret update and triggers a graceful upgrade so the new cert is
-# loaded without dropping any connections.
+# Certificate covering all proxy subdomains.
+# Start with letsencrypt-staging. Once verified, change issuerRef.name to
+# letsencrypt-production and delete the pingora-tls Secret to force re-issue.
 apiVersion: cert-manager.io/v1
 kind: Certificate
 metadata:
@@ -56,3 +63,6 @@
     - src.DOMAIN_SUFFIX
     - auth.DOMAIN_SUFFIX
     - s3.DOMAIN_SUFFIX
+    - grafana.DOMAIN_SUFFIX
+    - admin.DOMAIN_SUFFIX
+    - integration.DOMAIN_SUFFIX
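
The staging-to-production switchover described above amounts to two
commands — a sketch; the Certificate name is assumed to match its
pingora-tls secretName, and the namespace is assumed to be ingress:

  # Point the Certificate at the production issuer, then force re-issue.
  kubectl patch certificate pingora-tls -n ingress --type merge \
    -p '{"spec":{"issuerRef":{"name":"letsencrypt-production"}}}'
  kubectl delete secret pingora-tls -n ingress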

overlays/production/kustomization.yaml

@@ -3,14 +3,12 @@ kind: Kustomization
 # Production overlay — targets Scaleway Elastic Metal (Paris)
 #
-# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed):
-#   DOMAIN="yourdomain.com" EMAIL="ops@yourdomain.com"
-#   kustomize build overlays/production/ \
-#     | sed -e "s/DOMAIN_SUFFIX/${DOMAIN}/g" -e "s/ACME_EMAIL/${EMAIL}/g" \
-#     | kubectl apply --server-side --force-conflicts -f -
+# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sunbeam apply):
+#   sunbeam apply --env production --domain yourdomain.com
 resources:
   - ../../base/mesh
+  - ../../base/longhorn
+  - ../../base/cert-manager
   - ../../base/ingress
   - ../../base/ory
   - ../../base/data
@@ -18,20 +16,42 @@ resources:
   - ../../base/lasuite
   - ../../base/media
   - ../../base/devtools
   - ../../base/vso
+  - ../../base/monitoring
   # cert-manager ClusterIssuer + Certificate (requires cert-manager to be installed)
   - cert-manager.yaml
+  # CNPG daily backup schedule
+  - postgres-scheduled-backup.yaml
 images:
   # Set to your container registry. DOMAIN_SUFFIX is substituted by sed.
   - name: sunbeam-proxy
     newName: src.DOMAIN_SUFFIX/sunbeam/sunbeam-proxy
+  # La Gaufre integration service — built and pushed by `sunbeam build integration`
+  - name: integration
+    newName: src.DOMAIN_SUFFIX/studio/integration
+    newTag: latest
+  # Meet — built from source and pushed to the Gitea registry.
+  - name: meet-backend
+    newName: src.DOMAIN_SUFFIX/studio/meet-backend
+    newTag: latest
+  - name: meet-frontend
+    newName: src.DOMAIN_SUFFIX/studio/meet-frontend
+    newTag: latest
 patches:
-  - path: values-pingora.yaml
+  # Pingora host ports — bind :80/:443 to the host network
+  - path: patch-pingora-hostport.yaml
   # TODO: set OIDC redirect URIs to https://*.sunbeam.pt/...
   # - path: values-ory.yaml
-  # TODO: set production resource limits (64 GB server)
-  # - path: values-resources.yaml
+  # Production resource limits for the 64 GiB server
+  - path: values-resources.yaml
+  # LiveKit TURN service: ClusterIP (Pingora routes TURN traffic on :443)
+  - path: patch-livekit-service.yaml
+  # CNPG: production sizing (100 Gi storage, 8 Gi RAM) + barman S3 backup config
+  - path: patch-postgres-production.yaml
+  # OpenSearch: expand PVC to 50 Gi
+  - path: patch-opensearch-storage.yaml
+  # SeaweedFS volume: expand PVC to 600 Gi
+  - path: patch-seaweedfs-volume-size.yaml
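
For reference, the deleted comment above shows roughly what sunbeam apply
automates — a sketch; sunbeam's actual flags and internals are assumptions
beyond what the removed lines state:

  DOMAIN="yourdomain.com" EMAIL="ops@yourdomain.com"
  kustomize build overlays/production/ \
    | sed -e "s/DOMAIN_SUFFIX/${DOMAIN}/g" -e "s/ACME_EMAIL/${EMAIL}/g" \
    | kubectl apply --server-side --force-conflicts -f -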

overlays/production/patch-livekit-service.yaml

@@ -0,0 +1,10 @@
# Patch: keep the LiveKit TURN service as ClusterIP — Pingora routes external
# TURN traffic on :443. Without this patch, the chart's default LoadBalancer
# type would sit unserviced (klipper-lb is disabled) and contend with
# Pingora's host-port binding on 443.
apiVersion: v1
kind: Service
metadata:
  name: livekit-server-turn
  namespace: media
spec:
  type: ClusterIP

overlays/production/patch-opensearch-storage.yaml

@@ -0,0 +1,10 @@
# Expand OpenSearch PVC to 50 Gi in production.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
name: opensearch-data
namespace: data
spec:
resources:
requests:
storage: 50Gi

overlays/production/patch-pingora-hostport.yaml

@@ -0,0 +1,25 @@
# Bind Pingora container ports to the host network so external traffic
# on ports 80 and 443 reaches the proxy pod directly.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: pingora
  namespace: ingress
spec:
  template:
    spec:
      containers:
        - name: pingora
          ports:
            - name: http
              containerPort: 80
              hostPort: 80
              protocol: TCP
            - name: https
              containerPort: 443
              hostPort: 443
              protocol: TCP
            - name: ssh
              containerPort: 22
              hostPort: 22
              protocol: TCP
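
Once the pod is scheduled, the binding can be checked from outside the
cluster — a sketch with a placeholder host:

  # Expect an HTTP status code rather than a connection refusal.
  curl -sk -o /dev/null -w '%{http_code}\n' https://<server-ip>/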

overlays/production/patch-postgres-production.yaml

@@ -0,0 +1,45 @@
# Production CNPG cluster sizing for 12-core, 64 GiB Elastic Metal.
# Barman backs up WAL + base backups to Scaleway Object Storage (s3://sunbeam-backups/postgres).
# barman-s3-creds K8s Secret is synced by VSO from secret/scaleway-s3 in OpenBao.
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: postgres
  namespace: data
spec:
  instances: 1
  postgresql:
    parameters:
      max_connections: "200"
      shared_buffers: "2GB"
      effective_cache_size: "6GB"
      work_mem: "16MB"
      maintenance_work_mem: "512MB"
  storage:
    size: 100Gi
  resources:
    requests:
      memory: 4Gi
      cpu: "2"
    limits:
      memory: 8Gi
  backup:
    barmanObjectStore:
      destinationPath: "s3://sunbeam-backups/postgres"
      endpointURL: "https://s3.fr-par.scw.cloud"
      s3Credentials:
        accessKeyId:
          name: barman-s3-creds
          key: ACCESS_KEY_ID
        secretAccessKey:
          name: barman-s3-creds
          key: ACCESS_SECRET_KEY
      wal:
        compression: gzip
      data:
        compression: gzip
    retentionPolicy: "30d"

overlays/production/patch-seaweedfs-volume-size.yaml

@@ -0,0 +1,15 @@
# Expand SeaweedFS volume PVC to 600 Gi in production.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: seaweedfs-volume
  namespace: storage
spec:
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: [ReadWriteOnce]
        resources:
          requests:
            storage: 600Gi

overlays/production/postgres-scheduled-backup.yaml

@@ -0,0 +1,12 @@
apiVersion: postgresql.cnpg.io/v1
kind: ScheduledBackup
metadata:
  name: postgres-daily
  namespace: data
spec:
  # Daily at 02:00 UTC — CNPG cron has six fields, the first being seconds
  schedule: "0 0 2 * * *"
  backupOwnerReference: self
  cluster:
    name: postgres
  method: barmanObjectStore
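
Before waiting for the schedule to fire, the object-storage wiring can be
smoke-tested with a one-off Backup — a minimal sketch using the same
cluster reference (the name is illustrative):

  apiVersion: postgresql.cnpg.io/v1
  kind: Backup
  metadata:
    name: postgres-manual-test
    namespace: data
  spec:
    cluster:
      name: postgres
    method: barmanObjectStore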

overlays/production/values-resources.yaml

@@ -0,0 +1,293 @@
# Production resource limits — Scaleway Elastic Metal, 12 cores, 64 GiB RAM.
# ~10 GiB reserved for OS + k3s + Linkerd mesh overhead.
# Replicas scaled up for production workloads.
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: meet-celery-worker
  namespace: lasuite
spec:
  template:
    spec:
      containers:
        - name: meet-celery-worker
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cloudnative-pg
  namespace: data
spec:
  template:
    spec:
      containers:
        - name: manager
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 512Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: livekit-server
  namespace: media
spec:
  template:
    spec:
      containers:
        - name: livekit-server
          resources:
            requests:
              memory: 512Mi
              cpu: 500m
            limits:
              memory: 2Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: pingora
  namespace: ingress
spec:
  template:
    spec:
      containers:
        - name: pingora
          resources:
            requests:
              memory: 128Mi
              cpu: 250m
            limits:
              memory: 512Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: valkey
  namespace: data
spec:
  template:
    spec:
      containers:
        - name: valkey
          resources:
            requests:
              memory: 128Mi
              cpu: 50m
            limits:
              memory: 512Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: opensearch
  namespace: data
spec:
  template:
    spec:
      containers:
        - name: opensearch
          env:
            - name: OPENSEARCH_JAVA_OPTS
              value: "-Xms2g -Xmx4g"
          resources:
            requests:
              memory: 2Gi
              cpu: 500m
            limits:
              memory: 5Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: seaweedfs-filer
  namespace: storage
spec:
  template:
    spec:
      containers:
        - name: filer
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hydra-hydra-maester
  namespace: ory
spec:
  template:
    spec:
      containers:
        - name: hydra-maester
          resources:
            requests:
              memory: 32Mi
              cpu: 25m
            limits:
              memory: 128Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: login-ui
  namespace: ory
spec:
  template:
    spec:
      containers:
        - name: login-ui
          resources:
            requests:
              memory: 128Mi
              cpu: 50m
            limits:
              memory: 384Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hive
  namespace: lasuite
spec:
  template:
    spec:
      containers:
        - name: hive
          resources:
            requests:
              memory: 64Mi
            limits:
              memory: 256Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: people-backend
  namespace: lasuite
spec:
  replicas: 2
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: people-celery-worker
  namespace: lasuite
spec:
  replicas: 2
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: people-frontend
  namespace: lasuite
spec:
  replicas: 2
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-celery-worker
  namespace: lasuite
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: docs
          env:
            - name: CELERY_WORKER_CONCURRENCY
              value: "4"
          resources:
            requests:
              memory: 512Mi
              cpu: 250m
            limits:
              memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-backend
  namespace: lasuite
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: docs
          env:
            - name: WEB_CONCURRENCY
              value: "4"
          resources:
            requests:
              memory: 512Mi
              cpu: 250m
            limits:
              memory: 1Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-frontend
  namespace: lasuite
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: docs
          resources:
            requests:
              memory: 64Mi
            limits:
              memory: 256Mi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-y-provider
  namespace: lasuite
spec:
  replicas: 1
  template:
    spec:
      containers:
        - name: docs
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 1Gi