feat(infra): production bootstrap — cert-manager, longhorn, monitoring
Add new bases for cert-manager (Let's Encrypt + wildcard cert), Longhorn distributed storage, and monitoring (kube-prometheus-stack + Loki + Tempo + Grafana OIDC). Add cloud-init for Scaleway Elastic Metal provisioning. Production overlay: add patches for postgres sizing, SeaweedFS volume, OpenSearch storage, LiveKit service, Pingora host ports, resource limits, and CNPG daily barman backups. Update cert-manager.yaml with full dnsNames for all *.sunbeam.pt subdomains.

overlays/production/cert-manager.yaml
@@ -1,18 +1,30 @@
-# cert-manager resources for production TLS.
+# cert-manager issuers and certificate for production TLS.
 #
+# Prerequisites:
+# cert-manager must be installed in the cluster before applying this overlay:
+#     kubectl apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.yaml
+#
+# WORKFLOW: start with letsencrypt-staging to verify the HTTP-01 challenge
+# flow works without burning production rate limits. Once the staging cert
+# is issued successfully, flip the Certificate issuerRef to letsencrypt-production
+# and delete the old Secret so cert-manager re-issues with a trusted cert.
+#
-# DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed at deploy time.
-# See overlays/production/kustomization.yaml for the deploy command.
+# ACME_EMAIL is substituted by sunbeam apply.
 ---
-# ClusterIssuer: Let's Encrypt production via HTTP-01 challenge.
-#
-# cert-manager creates one Ingress per challenged domain. The pingora proxy
-# watches these Ingresses and routes /.well-known/acme-challenge/<token>
-# requests to the per-domain solver Service, so multi-SAN certificates are
-# issued correctly even when all domain challenges run in parallel.
+# Let's Encrypt staging — untrusted cert but no rate limits. Use for initial setup.
+apiVersion: cert-manager.io/v1
+kind: ClusterIssuer
+metadata:
+  name: letsencrypt-staging
+spec:
+  acme:
+    server: https://acme-staging-v02.api.letsencrypt.org/directory
+    email: ACME_EMAIL
+    privateKeySecretRef:
+      name: letsencrypt-staging-account-key
+    solvers:
+      - http01:
+          ingress:
+            serviceType: ClusterIP
+---
+# Let's Encrypt production — trusted cert, strict rate limits.
+# Switch to this once staging confirms challenges resolve correctly.
 apiVersion: cert-manager.io/v1
 kind: ClusterIssuer
 metadata:
@@ -26,16 +38,11 @@ spec:
     solvers:
       - http01:
           ingress:
-            # ingressClassName is intentionally blank: cert-manager still creates
-            # the Ingress object (which the proxy watches), but no ingress
-            # controller needs to act on it — the proxy handles routing itself.
-            ingressClassName: ""
             serviceType: ClusterIP
 ---
-# Certificate: single multi-SAN cert covering all proxy subdomains.
-# cert-manager issues it via HTTP-01, stores it in pingora-tls Secret, and
-# renews it automatically ~30 days before expiry. The watcher in sunbeam-proxy
-# detects the Secret update and triggers a graceful upgrade so the new cert is
-# loaded without dropping any connections.
+# Certificate covering all proxy subdomains.
+# Start with letsencrypt-staging. Once verified, change issuerRef.name to
+# letsencrypt-production and delete the pingora-tls Secret to force re-issue.
 apiVersion: cert-manager.io/v1
 kind: Certificate
 metadata:
@@ -56,3 +63,6 @@ spec:
   - src.DOMAIN_SUFFIX
   - auth.DOMAIN_SUFFIX
   - s3.DOMAIN_SUFFIX
+  - grafana.DOMAIN_SUFFIX
+  - admin.DOMAIN_SUFFIX
+  - integration.DOMAIN_SUFFIX
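
Note: the staging-to-production flip described in the WORKFLOW comment does not
require re-rendering the overlay. A minimal sketch, assuming the Certificate
object is also named pingora-tls in the ingress namespace (the Secret name comes
from the comments above; the Certificate name and namespace are assumptions):

    # Point the Certificate at the production issuer, then delete the
    # staging-issued Secret so cert-manager orders a trusted cert immediately.
    kubectl patch certificate pingora-tls -n ingress --type merge \
      -p '{"spec":{"issuerRef":{"name":"letsencrypt-production","kind":"ClusterIssuer"}}}'
    kubectl delete secret pingora-tls -n ingress
    kubectl get certificaterequest -n ingress -w   # watch the new order complete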

overlays/production/kustomization.yaml
@@ -3,14 +3,12 @@ kind: Kustomization
 
 # Production overlay — targets Scaleway Elastic Metal (Paris)
 #
-# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed):
-#   DOMAIN="yourdomain.com" EMAIL="ops@yourdomain.com"
-#   kustomize build overlays/production/ \
-#     | sed -e "s/DOMAIN_SUFFIX/${DOMAIN}/g" -e "s/ACME_EMAIL/${EMAIL}/g" \
-#     | kubectl apply --server-side --force-conflicts -f -
+# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sunbeam apply):
+#   sunbeam apply --env production --domain yourdomain.com
 
 resources:
   - ../../base/mesh
+  - ../../base/longhorn
+  - ../../base/cert-manager
   - ../../base/ingress
   - ../../base/ory
   - ../../base/data
@@ -18,20 +16,42 @@ resources:
   - ../../base/lasuite
   - ../../base/media
   - ../../base/devtools
   - ../../base/vso
+  - ../../base/monitoring
+  # cert-manager ClusterIssuer + Certificate (requires cert-manager to be installed)
   - cert-manager.yaml
+  # CNPG daily backup schedule
+  - postgres-scheduled-backup.yaml
 
 images:
   # Set to your container registry. DOMAIN_SUFFIX is substituted by sed.
   - name: sunbeam-proxy
     newName: src.DOMAIN_SUFFIX/sunbeam/sunbeam-proxy
+  # La Gaufre integration service — built and pushed by `sunbeam build integration`
+  - name: integration
+    newName: src.DOMAIN_SUFFIX/studio/integration
+    newTag: latest
+
+  # Meet — built from source and pushed to Gitea registry.
+  - name: meet-backend
+    newName: src.DOMAIN_SUFFIX/studio/meet-backend
+    newTag: latest
+  - name: meet-frontend
+    newName: src.DOMAIN_SUFFIX/studio/meet-frontend
+    newTag: latest
 
 patches:
   - path: values-pingora.yaml
+  # Pingora host ports — bind :80/:443 to the host network
+  - path: patch-pingora-hostport.yaml
 
   # TODO: set OIDC redirect URIs to https://*.sunbeam.pt/...
   # - path: values-ory.yaml
+  # Production resource limits for 64 GiB server
+  - path: values-resources.yaml
-  # TODO: set production resource limits (64 GB server)
-  # - path: values-resources.yaml
+
+  # LiveKit TURN service: ClusterIP (Pingora routes TURN traffic on :443)
+  - path: patch-livekit-service.yaml
+
+  # CNPG: production sizing (100 Gi storage, 8 Gi RAM) + barman S3 backup config
+  - path: patch-postgres-production.yaml
+
+  # OpenSearch: expand PVC to 50 Gi
+  - path: patch-opensearch-storage.yaml
+
+  # SeaweedFS volume: expand PVC to 600 Gi
+  - path: patch-seaweedfs-volume-size.yaml

overlays/production/patch-livekit-service.yaml (new file)
@@ -0,0 +1,10 @@
# Patch: keep the LiveKit TURN service as ClusterIP — Pingora routes external TURN traffic.
# Without this patch, klipper-lb (disabled in this cluster) or the default
# LoadBalancer service type may conflict with Pingora's host-port binding on 443.
apiVersion: v1
kind: Service
metadata:
  name: livekit-server-turn
  namespace: media
spec:
  type: ClusterIP

overlays/production/patch-opensearch-storage.yaml (new file)
@@ -0,0 +1,10 @@
# Expand OpenSearch PVC to 50 Gi in production.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: opensearch-data
  namespace: data
spec:
  resources:
    requests:
      storage: 50Gi

overlays/production/patch-pingora-hostport.yaml (new file)
@@ -0,0 +1,25 @@
# Bind Pingora container ports to the host network so external traffic
# on ports 80 and 443 reaches the proxy pod directly.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: pingora
  namespace: ingress
spec:
  template:
    spec:
      containers:
        - name: pingora
          ports:
            - name: http
              containerPort: 80
              hostPort: 80
              protocol: TCP
            - name: https
              containerPort: 443
              hostPort: 443
              protocol: TCP
            - name: ssh
              containerPort: 22
              hostPort: 22
              protocol: TCP
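
Note: hostPort is typically wired up by the CNI portmap plugin rather than a
listening socket, so it may not appear in ss or netstat on the node; probing the
node address directly is the more reliable check. A quick sketch (the node IP is
a placeholder):

    curl -sI  http://<node-ip>/  | head -n 1   # expect a response on :80
    curl -skI https://<node-ip>/ | head -n 1   # expect a response on :443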

overlays/production/patch-postgres-production.yaml (new file)
@@ -0,0 +1,45 @@
# Production CNPG cluster sizing for 12-core, 64 GiB Elastic Metal.
# Barman backs up WAL + base backups to Scaleway Object Storage (s3://sunbeam-backups/postgres).
# barman-s3-creds K8s Secret is synced by VSO from secret/scaleway-s3 in OpenBao.
apiVersion: postgresql.cnpg.io/v1
kind: Cluster
metadata:
  name: postgres
  namespace: data
spec:
  instances: 1

  postgresql:
    parameters:
      max_connections: "200"
      shared_buffers: "2GB"
      effective_cache_size: "6GB"
      work_mem: "16MB"
      maintenance_work_mem: "512MB"

  storage:
    size: 100Gi

  resources:
    requests:
      memory: 4Gi
      cpu: "2"
    limits:
      memory: 8Gi

  backup:
    barmanObjectStore:
      destinationPath: "s3://sunbeam-backups/postgres"
      endpointURL: "https://s3.fr-par.scw.cloud"
      s3Credentials:
        accessKeyId:
          name: barman-s3-creds
          key: ACCESS_KEY_ID
        secretAccessKey:
          name: barman-s3-creds
          key: ACCESS_SECRET_KEY
      wal:
        compression: gzip
      data:
        compression: gzip
    retentionPolicy: "30d"
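
Note: the parameters follow the usual rules of thumb against the 8 Gi limit
(shared_buffers at roughly 25%, effective_cache_size at roughly 75%). A sketch
for verifying they landed after the rollout, assuming CNPG's default pod naming
(<cluster>-<ordinal>) and local superuser access inside the postgres container:

    kubectl exec -n data postgres-1 -c postgres -- \
      psql -U postgres -c "SHOW shared_buffers" -c "SHOW effective_cache_size"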

overlays/production/patch-seaweedfs-volume-size.yaml (new file)
@@ -0,0 +1,15 @@
# Expand SeaweedFS volume PVC to 600 Gi in production.
apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: seaweedfs-volume
  namespace: storage
spec:
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: [ReadWriteOnce]
        resources:
          requests:
            storage: 600Gi
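
Note: Kubernetes does not resize existing PVCs when volumeClaimTemplates change
(and may reject the template update on a live StatefulSet as an immutable field);
the template only applies to replicas created later. To grow a live volume, patch
the PVC directly. A sketch, assuming the Longhorn storage class allows expansion
and the usual <template>-<statefulset>-<ordinal> PVC naming:

    kubectl patch pvc data-seaweedfs-volume-0 -n storage \
      -p '{"spec":{"resources":{"requests":{"storage":"600Gi"}}}}'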

overlays/production/postgres-scheduled-backup.yaml (new file)
@@ -0,0 +1,12 @@
apiVersion: postgresql.cnpg.io/v1
kind: ScheduledBackup
metadata:
  name: postgres-daily
  namespace: data
spec:
  # Daily at 02:00 UTC. CNPG schedules use the six-field cron format with a
  # leading seconds field, so the five-field "0 2 * * *" would not parse.
  schedule: "0 0 2 * * *"
  backupOwnerReference: self
  cluster:
    name: postgres
  method: barmanObjectStore
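
Note: the same barmanObjectStore configuration also serves one-off backups, which
makes a useful smoke test before trusting the nightly schedule. A minimal sketch
(the Backup name is arbitrary):

    apiVersion: postgresql.cnpg.io/v1
    kind: Backup
    metadata:
      name: postgres-manual-smoke
      namespace: data
    spec:
      cluster:
        name: postgres
      method: barmanObjectStore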

overlays/production/values-resources.yaml (new file)
@@ -0,0 +1,293 @@
# Production resource limits — Scaleway Elastic Metal, 12 cores, 64 GiB RAM.
# ~10 GiB reserved for OS + k3s + Linkerd mesh overhead.
# Replicas scaled up for production workloads.

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: meet-celery-worker
  namespace: lasuite
spec:
  template:
    spec:
      containers:
        - name: meet-celery-worker
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 1Gi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cloudnative-pg
  namespace: data
spec:
  template:
    spec:
      containers:
        - name: manager
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 512Mi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: livekit-server
  namespace: media
spec:
  template:
    spec:
      containers:
        - name: livekit-server
          resources:
            requests:
              memory: 512Mi
              cpu: 500m
            limits:
              memory: 2Gi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: pingora
  namespace: ingress
spec:
  template:
    spec:
      containers:
        - name: pingora
          resources:
            requests:
              memory: 128Mi
              cpu: 250m
            limits:
              memory: 512Mi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: valkey
  namespace: data
spec:
  template:
    spec:
      containers:
        - name: valkey
          resources:
            requests:
              memory: 128Mi
              cpu: 50m
            limits:
              memory: 512Mi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: opensearch
  namespace: data
spec:
  template:
    spec:
      containers:
        - name: opensearch
          env:
            - name: OPENSEARCH_JAVA_OPTS
              value: "-Xms2g -Xmx4g"
          resources:
            requests:
              memory: 2Gi
              cpu: 500m
            limits:
              memory: 5Gi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: seaweedfs-filer
  namespace: storage
spec:
  template:
    spec:
      containers:
        - name: filer
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 1Gi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hydra-hydra-maester
  namespace: ory
spec:
  template:
    spec:
      containers:
        - name: hydra-maester
          resources:
            requests:
              memory: 32Mi
              cpu: 25m
            limits:
              memory: 128Mi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: login-ui
  namespace: ory
spec:
  template:
    spec:
      containers:
        - name: login-ui
          resources:
            requests:
              memory: 128Mi
              cpu: 50m
            limits:
              memory: 384Mi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hive
  namespace: lasuite
spec:
  template:
    spec:
      containers:
        - name: hive
          resources:
            requests:
              memory: 64Mi
            limits:
              memory: 256Mi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: people-backend
  namespace: lasuite
spec:
  replicas: 2

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: people-celery-worker
  namespace: lasuite
spec:
  replicas: 2

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: people-frontend
  namespace: lasuite
spec:
  replicas: 2

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-celery-worker
  namespace: lasuite
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: docs
          env:
            - name: CELERY_WORKER_CONCURRENCY
              value: "4"
          resources:
            requests:
              memory: 512Mi
              cpu: 250m
            limits:
              memory: 1Gi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-backend
  namespace: lasuite
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: docs
          env:
            - name: WEB_CONCURRENCY
              value: "4"
          resources:
            requests:
              memory: 512Mi
              cpu: 250m
            limits:
              memory: 1Gi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-frontend
  namespace: lasuite
spec:
  replicas: 2
  template:
    spec:
      containers:
        - name: docs
          resources:
            requests:
              memory: 64Mi
            limits:
              memory: 256Mi

---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: docs-y-provider
  namespace: lasuite
spec:
  replicas: 1
  template:
    spec:
      containers:
        - name: docs
          resources:
            requests:
              memory: 256Mi
              cpu: 100m
            limits:
              memory: 1Gi
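
Note: a quick way to confirm the requests in this file actually fit the 64 GiB
node after rollout (the node name is a placeholder):

    kubectl describe node <node-name> | grep -A 8 'Allocated resources'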