feat(infra): production bootstrap — cert-manager, longhorn, monitoring
Add new bases for cert-manager (Let's Encrypt + wildcard cert), Longhorn distributed storage, and monitoring (kube-prometheus-stack + Loki + Tempo + Grafana OIDC). Add cloud-init for Scaleway Elastic Metal provisioning. Production overlay: add patches for postgres sizing, SeaweedFS volume, OpenSearch storage, LiveKit service, Pingora host ports, resource limits, and CNPG daily barman backups. Update cert-manager.yaml with full dnsNames for all *.sunbeam.pt subdomains.
This commit is contained in:
15
base/cert-manager/kustomization.yaml
Normal file
15
base/cert-manager/kustomization.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- namespace.yaml
|
||||||
|
|
||||||
|
helmCharts:
|
||||||
|
# helm repo add jetstack https://charts.jetstack.io
|
||||||
|
- name: cert-manager
|
||||||
|
repo: https://charts.jetstack.io
|
||||||
|
version: "1.19.4"
|
||||||
|
releaseName: cert-manager
|
||||||
|
namespace: cert-manager
|
||||||
|
valuesFile: values.yaml
|
||||||
|
includeCRDs: true
|
||||||
4
base/cert-manager/namespace.yaml
Normal file
4
base/cert-manager/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: cert-manager
|
||||||
2
base/cert-manager/values.yaml
Normal file
2
base/cert-manager/values.yaml
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
crds:
|
||||||
|
enabled: true
|
||||||
13
base/longhorn/kustomization.yaml
Normal file
13
base/longhorn/kustomization.yaml
Normal file
@@ -0,0 +1,13 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- namespace.yaml
|
||||||
|
|
||||||
|
helmCharts:
|
||||||
|
- name: longhorn
|
||||||
|
repo: https://charts.longhorn.io
|
||||||
|
version: "1.11.0"
|
||||||
|
releaseName: longhorn
|
||||||
|
namespace: longhorn-system
|
||||||
|
valuesFile: values.yaml
|
||||||
4
base/longhorn/namespace.yaml
Normal file
4
base/longhorn/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: longhorn-system
|
||||||
24
base/longhorn/values.yaml
Normal file
24
base/longhorn/values.yaml
Normal file
@@ -0,0 +1,24 @@
|
|||||||
|
# Longhorn distributed block storage
|
||||||
|
# Single-node production deployment — RAID1 hardware already provides redundancy.
|
||||||
|
|
||||||
|
defaultSettings:
|
||||||
|
# 1 replica: RAID1 mirrors the disk, so software replication adds no benefit
|
||||||
|
# on a single node and would halve available capacity.
|
||||||
|
defaultReplicaCount: 1
|
||||||
|
|
||||||
|
# Orphan auto-deletion: clean up node/instance orphaned resources automatically
|
||||||
|
orphanAutoDeletion: true
|
||||||
|
|
||||||
|
# Allow volumes to be scheduled on the only available node even when disk
|
||||||
|
# pressure is detected (single-node: no other node to reschedule to).
|
||||||
|
allowVolumeCreationWithDegradedAvailability: true
|
||||||
|
|
||||||
|
# Reduce reserved percentage to 10% — RAID1 hardware provides physical redundancy,
|
||||||
|
# so Longhorn doesn't need to hold back 30% for software replicas.
|
||||||
|
# With 937 GiB disk: 843 GiB schedulable (600 SW + 100 PG + ~143 headroom).
|
||||||
|
storageReservedPercentageForDefaultDisk: 10
|
||||||
|
|
||||||
|
# Set Longhorn as the default StorageClass.
|
||||||
|
persistence:
|
||||||
|
defaultClass: true
|
||||||
|
defaultClassReplicaCount: 1
|
||||||
32
base/monitoring/grafana-oauth2client.yaml
Normal file
32
base/monitoring/grafana-oauth2client.yaml
Normal file
@@ -0,0 +1,32 @@
|
|||||||
|
# Hydra OAuth2Client for Grafana OIDC sign-in.
|
||||||
|
#
|
||||||
|
# Hydra Maester watches this CRD and:
|
||||||
|
# 1. Registers the client with Hydra
|
||||||
|
# 2. Creates K8s Secret "grafana-oidc" in monitoring namespace
|
||||||
|
# with CLIENT_ID and CLIENT_SECRET keys.
|
||||||
|
#
|
||||||
|
# Grafana picks up the secret via envFromSecret and interpolates
|
||||||
|
# ${CLIENT_ID} / ${CLIENT_SECRET} in grafana.ini at startup.
|
||||||
|
#
|
||||||
|
# DOMAIN_SUFFIX is substituted by sunbeam apply.
|
||||||
|
---
|
||||||
|
apiVersion: hydra.ory.sh/v1alpha1
|
||||||
|
kind: OAuth2Client
|
||||||
|
metadata:
|
||||||
|
name: grafana
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
clientName: Grafana
|
||||||
|
grantTypes:
|
||||||
|
- authorization_code
|
||||||
|
- refresh_token
|
||||||
|
responseTypes:
|
||||||
|
- code
|
||||||
|
scope: openid email profile
|
||||||
|
redirectUris:
|
||||||
|
- https://grafana.DOMAIN_SUFFIX/login/generic_oauth
|
||||||
|
postLogoutRedirectUris:
|
||||||
|
- https://grafana.DOMAIN_SUFFIX/
|
||||||
|
tokenEndpointAuthMethod: client_secret_post
|
||||||
|
secretName: grafana-oidc
|
||||||
|
skipConsent: true
|
||||||
34
base/monitoring/kustomization.yaml
Normal file
34
base/monitoring/kustomization.yaml
Normal file
@@ -0,0 +1,34 @@
|
|||||||
|
apiVersion: kustomize.config.k8s.io/v1beta1
|
||||||
|
kind: Kustomization
|
||||||
|
|
||||||
|
namespace: monitoring
|
||||||
|
|
||||||
|
resources:
|
||||||
|
- namespace.yaml
|
||||||
|
- vault-secrets.yaml
|
||||||
|
- grafana-oauth2client.yaml
|
||||||
|
|
||||||
|
helmCharts:
|
||||||
|
# helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
|
||||||
|
- name: kube-prometheus-stack
|
||||||
|
repo: https://prometheus-community.github.io/helm-charts
|
||||||
|
version: "82.9.0"
|
||||||
|
releaseName: kube-prometheus-stack
|
||||||
|
namespace: monitoring
|
||||||
|
valuesFile: prometheus-values.yaml
|
||||||
|
includeCRDs: true
|
||||||
|
|
||||||
|
# helm repo add grafana https://grafana.github.io/helm-charts
|
||||||
|
- name: loki
|
||||||
|
repo: https://grafana.github.io/helm-charts
|
||||||
|
version: "6.53.0"
|
||||||
|
releaseName: loki
|
||||||
|
namespace: monitoring
|
||||||
|
valuesFile: loki-values.yaml
|
||||||
|
|
||||||
|
- name: tempo
|
||||||
|
repo: https://grafana.github.io/helm-charts
|
||||||
|
version: "1.24.4"
|
||||||
|
releaseName: tempo
|
||||||
|
namespace: monitoring
|
||||||
|
valuesFile: tempo-values.yaml
|
||||||
43
base/monitoring/loki-values.yaml
Normal file
43
base/monitoring/loki-values.yaml
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# Loki — monolithic single-binary mode, filesystem storage, single tenant.
|
||||||
|
deploymentMode: SingleBinary
|
||||||
|
|
||||||
|
loki:
|
||||||
|
auth_enabled: false
|
||||||
|
commonConfig:
|
||||||
|
replication_factor: 1
|
||||||
|
storage:
|
||||||
|
type: filesystem
|
||||||
|
schemaConfig:
|
||||||
|
configs:
|
||||||
|
- from: "2024-01-01"
|
||||||
|
store: tsdb
|
||||||
|
object_store: filesystem
|
||||||
|
schema: v13
|
||||||
|
index:
|
||||||
|
prefix: index_
|
||||||
|
period: 24h
|
||||||
|
|
||||||
|
singleBinary:
|
||||||
|
replicas: 1
|
||||||
|
persistence:
|
||||||
|
enabled: true
|
||||||
|
size: 30Gi
|
||||||
|
|
||||||
|
# Disable sub-charts/probes not needed for single-node
|
||||||
|
backend:
|
||||||
|
replicas: 0
|
||||||
|
read:
|
||||||
|
replicas: 0
|
||||||
|
write:
|
||||||
|
replicas: 0
|
||||||
|
|
||||||
|
monitoring:
|
||||||
|
selfMonitoring:
|
||||||
|
enabled: false
|
||||||
|
grafanaAgent:
|
||||||
|
installOperator: false
|
||||||
|
lokiCanary:
|
||||||
|
enabled: false
|
||||||
|
|
||||||
|
test:
|
||||||
|
enabled: false
|
||||||
4
base/monitoring/namespace.yaml
Normal file
4
base/monitoring/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
apiVersion: v1
|
||||||
|
kind: Namespace
|
||||||
|
metadata:
|
||||||
|
name: monitoring
|
||||||
96
base/monitoring/prometheus-values.yaml
Normal file
96
base/monitoring/prometheus-values.yaml
Normal file
@@ -0,0 +1,96 @@
|
|||||||
|
# kube-prometheus-stack — Prometheus + AlertManager + Grafana + node-exporter + kube-state-metrics
|
||||||
|
#
|
||||||
|
# k3s quirks: kube-proxy is replaced by Cilium; etcd/scheduler/controller-manager
|
||||||
|
# don't expose metrics on standard ports. Disable their monitors to avoid noise.
|
||||||
|
|
||||||
|
grafana:
|
||||||
|
adminUser: admin
|
||||||
|
admin:
|
||||||
|
existingSecret: grafana-admin
|
||||||
|
passwordKey: admin-password
|
||||||
|
persistence:
|
||||||
|
enabled: true
|
||||||
|
size: 2Gi
|
||||||
|
# Inject Hydra OIDC client credentials (created by Hydra Maester from the OAuth2Client CRD)
|
||||||
|
envFromSecret: grafana-oidc
|
||||||
|
grafana.ini:
|
||||||
|
server:
|
||||||
|
root_url: "https://grafana.DOMAIN_SUFFIX"
|
||||||
|
auth:
|
||||||
|
# Keep local login as fallback (admin password from grafana-admin secret)
|
||||||
|
disable_login_form: false
|
||||||
|
signout_redirect_url: "https://auth.DOMAIN_SUFFIX/oauth2/sessions/logout"
|
||||||
|
auth.generic_oauth:
|
||||||
|
enabled: true
|
||||||
|
name: Sunbeam
|
||||||
|
icon: signin
|
||||||
|
# CLIENT_ID / CLIENT_SECRET injected from grafana-oidc K8s Secret via envFromSecret
|
||||||
|
client_id: "${CLIENT_ID}"
|
||||||
|
client_secret: "${CLIENT_SECRET}"
|
||||||
|
scopes: "openid email profile"
|
||||||
|
auth_url: "https://auth.DOMAIN_SUFFIX/oauth2/auth"
|
||||||
|
token_url: "https://auth.DOMAIN_SUFFIX/oauth2/token"
|
||||||
|
api_url: "https://auth.DOMAIN_SUFFIX/userinfo"
|
||||||
|
allow_sign_up: true
|
||||||
|
# Small studio — anyone with a valid La Suite account is an admin.
|
||||||
|
# To restrict to specific users, set role_attribute_path instead.
|
||||||
|
auto_assign_org_role: Admin
|
||||||
|
skip_org_role_sync: true
|
||||||
|
additionalDataSources:
|
||||||
|
- name: Loki
|
||||||
|
type: loki
|
||||||
|
url: http://loki.monitoring.svc.cluster.local:3100
|
||||||
|
access: proxy
|
||||||
|
isDefault: false
|
||||||
|
- name: Tempo
|
||||||
|
type: tempo
|
||||||
|
url: http://tempo.monitoring.svc.cluster.local:3100
|
||||||
|
access: proxy
|
||||||
|
isDefault: false
|
||||||
|
|
||||||
|
prometheus:
|
||||||
|
prometheusSpec:
|
||||||
|
retention: 90d
|
||||||
|
storageSpec:
|
||||||
|
volumeClaimTemplate:
|
||||||
|
spec:
|
||||||
|
accessModes: [ReadWriteOnce]
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 30Gi
|
||||||
|
|
||||||
|
alertmanager:
|
||||||
|
alertmanagerSpec:
|
||||||
|
storage:
|
||||||
|
volumeClaimTemplate:
|
||||||
|
spec:
|
||||||
|
accessModes: [ReadWriteOnce]
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 2Gi
|
||||||
|
config:
|
||||||
|
global:
|
||||||
|
smtp_from: "alerts@DOMAIN_SUFFIX"
|
||||||
|
smtp_smarthost: "postfix.lasuite.svc.cluster.local:25"
|
||||||
|
smtp_require_tls: false
|
||||||
|
route:
|
||||||
|
group_by: [alertname, namespace]
|
||||||
|
group_wait: 30s
|
||||||
|
group_interval: 5m
|
||||||
|
repeat_interval: 12h
|
||||||
|
receiver: email
|
||||||
|
receivers:
|
||||||
|
- name: email
|
||||||
|
email_configs:
|
||||||
|
- to: "ops@DOMAIN_SUFFIX"
|
||||||
|
send_resolved: true
|
||||||
|
|
||||||
|
# Disable monitors for components k3s doesn't expose
|
||||||
|
kubeEtcd:
|
||||||
|
enabled: false
|
||||||
|
kubeControllerManager:
|
||||||
|
enabled: false
|
||||||
|
kubeScheduler:
|
||||||
|
enabled: false
|
||||||
|
kubeProxy:
|
||||||
|
enabled: false
|
||||||
26
base/monitoring/tempo-values.yaml
Normal file
26
base/monitoring/tempo-values.yaml
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# Tempo — monolithic single-binary, local filesystem backend.
|
||||||
|
# Receives OTLP over gRPC (:4317) and HTTP (:4318).
|
||||||
|
tempo:
|
||||||
|
reportingEnabled: false
|
||||||
|
receivers:
|
||||||
|
otlp:
|
||||||
|
protocols:
|
||||||
|
grpc:
|
||||||
|
endpoint: "0.0.0.0:4317"
|
||||||
|
http:
|
||||||
|
endpoint: "0.0.0.0:4318"
|
||||||
|
storage:
|
||||||
|
trace:
|
||||||
|
backend: local
|
||||||
|
local:
|
||||||
|
path: /var/tempo/traces
|
||||||
|
wal:
|
||||||
|
path: /var/tempo/wal
|
||||||
|
|
||||||
|
persistence:
|
||||||
|
enabled: true
|
||||||
|
size: 20Gi
|
||||||
|
|
||||||
|
# Expose OTLP ports as a ClusterIP service
|
||||||
|
service:
|
||||||
|
type: ClusterIP
|
||||||
36
base/monitoring/vault-secrets.yaml
Normal file
36
base/monitoring/vault-secrets.yaml
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
---
|
||||||
|
apiVersion: secrets.hashicorp.com/v1beta1
|
||||||
|
kind: VaultAuth
|
||||||
|
metadata:
|
||||||
|
name: vso-auth
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
method: kubernetes
|
||||||
|
mount: kubernetes
|
||||||
|
kubernetes:
|
||||||
|
role: vso
|
||||||
|
serviceAccount: default
|
||||||
|
---
|
||||||
|
# Grafana admin password from OpenBao KV at secret/grafana.
|
||||||
|
apiVersion: secrets.hashicorp.com/v1beta1
|
||||||
|
kind: VaultStaticSecret
|
||||||
|
metadata:
|
||||||
|
name: grafana-admin
|
||||||
|
namespace: monitoring
|
||||||
|
spec:
|
||||||
|
vaultAuthRef: vso-auth
|
||||||
|
mount: secret
|
||||||
|
type: kv-v2
|
||||||
|
path: grafana
|
||||||
|
refreshAfter: 30s
|
||||||
|
destination:
|
||||||
|
name: grafana-admin
|
||||||
|
create: true
|
||||||
|
overwrite: true
|
||||||
|
transformation:
|
||||||
|
excludeRaw: true
|
||||||
|
templates:
|
||||||
|
admin-password:
|
||||||
|
text: "{{ index .Secrets \"admin-password\" }}"
|
||||||
|
admin-user:
|
||||||
|
text: "admin"
|
||||||
47
cloud-init.yaml
Normal file
47
cloud-init.yaml
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
#cloud-config
|
||||||
|
# Scaleway Elastic Metal — latest Debian
|
||||||
|
# Provisions: sienna user w/ GitHub SSH keys, k3s (traefik disabled)
|
||||||
|
|
||||||
|
users:
|
||||||
|
- name: sienna
|
||||||
|
groups: [sudo]
|
||||||
|
shell: /bin/bash
|
||||||
|
sudo: "ALL=(ALL) NOPASSWD:ALL"
|
||||||
|
ssh_import_id:
|
||||||
|
- gh:siennathesane
|
||||||
|
|
||||||
|
# Disable direct root login (SSH keys only for the users above).
# NOTE(review): this does NOT lock the default debian user or disable SSH
# password auth — add `ssh_pwauth: false` if that was the intent.
|
||||||
|
disable_root: true
|
||||||
|
|
||||||
|
package_update: true
|
||||||
|
package_upgrade: true
|
||||||
|
|
||||||
|
packages:
|
||||||
|
- curl
|
||||||
|
- ca-certificates
|
||||||
|
- jq
|
||||||
|
|
||||||
|
# Write k3s config before the installer runs so traefik is never started
|
||||||
|
write_files:
|
||||||
|
- path: /etc/rancher/k3s/config.yaml
|
||||||
|
owner: root:root
|
||||||
|
permissions: "0644"
|
||||||
|
content: |
|
||||||
|
disable:
|
||||||
|
- traefik
|
||||||
|
|
||||||
|
runcmd:
|
||||||
|
# Install k3s (picks up /etc/rancher/k3s/config.yaml automatically)
|
||||||
|
- curl -sfL https://get.k3s.io | sh -
|
||||||
|
# Allow sienna to use kubectl without sudo
|
||||||
|
- mkdir -p /home/sienna/.kube
|
||||||
|
- cp /etc/rancher/k3s/k3s.yaml /home/sienna/.kube/config
|
||||||
|
- chown -R sienna:sienna /home/sienna/.kube
|
||||||
|
- chmod 600 /home/sienna/.kube/config
|
||||||
|
# Ensure k3s is enabled and running (the installer normally does this already;
# idempotent safeguard in case the install step raced the service manager)
|
||||||
|
- systemctl enable --now k3s
|
||||||
|
|
||||||
|
final_message: |
|
||||||
|
Sunbeam node ready. k3s installed, traefik disabled.
|
||||||
|
SSH: ssh sienna@<server-ip>
|
||||||
|
kubectl: KUBECONFIG=~/.kube/config kubectl get nodes
|
||||||
@@ -1,18 +1,30 @@
|
|||||||
# cert-manager resources for production TLS.
|
# cert-manager issuers and certificate for production TLS.
|
||||||
#
|
#
|
||||||
# Prerequisites:
|
# WORKFLOW: start with letsencrypt-staging to verify the HTTP-01 challenge
|
||||||
# cert-manager must be installed in the cluster before applying this overlay:
|
# flow works without burning production rate limits. Once the staging cert
|
||||||
# kubectl apply -f https://github.com/cert-manager/cert-manager/releases/latest/download/cert-manager.yaml
|
# is issued successfully, flip the Certificate issuerRef to letsencrypt-production
|
||||||
|
# and delete the old Secret so cert-manager re-issues with a trusted cert.
|
||||||
#
|
#
|
||||||
# DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed at deploy time.
|
# ACME_EMAIL is substituted by sunbeam apply.
|
||||||
# See overlays/production/kustomization.yaml for the deploy command.
|
|
||||||
---
|
---
|
||||||
# ClusterIssuer: Let's Encrypt production via HTTP-01 challenge.
|
# Let's Encrypt staging — untrusted cert but no rate limits. Use for initial setup.
|
||||||
#
|
apiVersion: cert-manager.io/v1
|
||||||
# cert-manager creates one Ingress per challenged domain. The pingora proxy
|
kind: ClusterIssuer
|
||||||
# watches these Ingresses and routes /.well-known/acme-challenge/<token>
|
metadata:
|
||||||
# requests to the per-domain solver Service, so multi-SAN certificates are
|
name: letsencrypt-staging
|
||||||
# issued correctly even when all domain challenges run in parallel.
|
spec:
|
||||||
|
acme:
|
||||||
|
server: https://acme-staging-v02.api.letsencrypt.org/directory
|
||||||
|
email: ACME_EMAIL
|
||||||
|
privateKeySecretRef:
|
||||||
|
name: letsencrypt-staging-account-key
|
||||||
|
solvers:
|
||||||
|
- http01:
|
||||||
|
ingress:
|
||||||
|
serviceType: ClusterIP
|
||||||
|
---
|
||||||
|
# Let's Encrypt production — trusted cert, strict rate limits.
|
||||||
|
# Switch to this once staging confirms challenges resolve correctly.
|
||||||
apiVersion: cert-manager.io/v1
|
apiVersion: cert-manager.io/v1
|
||||||
kind: ClusterIssuer
|
kind: ClusterIssuer
|
||||||
metadata:
|
metadata:
|
||||||
@@ -26,16 +38,11 @@ spec:
|
|||||||
solvers:
|
solvers:
|
||||||
- http01:
|
- http01:
|
||||||
ingress:
|
ingress:
|
||||||
# ingressClassName is intentionally blank: cert-manager still creates
|
serviceType: ClusterIP
|
||||||
# the Ingress object (which the proxy watches), but no ingress
|
|
||||||
# controller needs to act on it — the proxy handles routing itself.
|
|
||||||
ingressClassName: ""
|
|
||||||
---
|
---
|
||||||
# Certificate: single multi-SAN cert covering all proxy subdomains.
|
# Certificate covering all proxy subdomains.
|
||||||
# cert-manager issues it via HTTP-01, stores it in pingora-tls Secret, and
|
# Start with letsencrypt-staging. Once verified, change issuerRef.name to
|
||||||
# renews it automatically ~30 days before expiry. The watcher in sunbeam-proxy
|
# letsencrypt-production and delete the pingora-tls Secret to force re-issue.
|
||||||
# detects the Secret update and triggers a graceful upgrade so the new cert is
|
|
||||||
# loaded without dropping any connections.
|
|
||||||
apiVersion: cert-manager.io/v1
|
apiVersion: cert-manager.io/v1
|
||||||
kind: Certificate
|
kind: Certificate
|
||||||
metadata:
|
metadata:
|
||||||
@@ -56,3 +63,6 @@ spec:
|
|||||||
- src.DOMAIN_SUFFIX
|
- src.DOMAIN_SUFFIX
|
||||||
- auth.DOMAIN_SUFFIX
|
- auth.DOMAIN_SUFFIX
|
||||||
- s3.DOMAIN_SUFFIX
|
- s3.DOMAIN_SUFFIX
|
||||||
|
- grafana.DOMAIN_SUFFIX
|
||||||
|
- admin.DOMAIN_SUFFIX
|
||||||
|
- integration.DOMAIN_SUFFIX
|
||||||
|
|||||||
@@ -3,14 +3,12 @@ kind: Kustomization
|
|||||||
|
|
||||||
# Production overlay — targets Scaleway Elastic Metal (Paris)
|
# Production overlay — targets Scaleway Elastic Metal (Paris)
|
||||||
#
|
#
|
||||||
# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sed):
|
# Deploy (DOMAIN_SUFFIX and ACME_EMAIL are substituted by sunbeam apply):
|
||||||
# DOMAIN="yourdomain.com" EMAIL="ops@yourdomain.com"
|
# sunbeam apply --env production --domain yourdomain.com
|
||||||
# kustomize build overlays/production/ \
|
|
||||||
# | sed -e "s/DOMAIN_SUFFIX/${DOMAIN}/g" -e "s/ACME_EMAIL/${EMAIL}/g" \
|
|
||||||
# | kubectl apply --server-side --force-conflicts -f -
|
|
||||||
|
|
||||||
resources:
|
resources:
|
||||||
- ../../base/mesh
|
- ../../base/longhorn
|
||||||
|
- ../../base/cert-manager
|
||||||
- ../../base/ingress
|
- ../../base/ingress
|
||||||
- ../../base/ory
|
- ../../base/ory
|
||||||
- ../../base/data
|
- ../../base/data
|
||||||
@@ -18,20 +16,42 @@ resources:
|
|||||||
- ../../base/lasuite
|
- ../../base/lasuite
|
||||||
- ../../base/media
|
- ../../base/media
|
||||||
- ../../base/devtools
|
- ../../base/devtools
|
||||||
|
- ../../base/vso
|
||||||
|
- ../../base/monitoring
|
||||||
# cert-manager ClusterIssuer + Certificate (requires cert-manager to be installed)
|
# cert-manager ClusterIssuer + Certificate (requires cert-manager to be installed)
|
||||||
- cert-manager.yaml
|
- cert-manager.yaml
|
||||||
|
# CNPG daily backup schedule
|
||||||
|
- postgres-scheduled-backup.yaml
|
||||||
|
|
||||||
images:
|
images:
|
||||||
# Set to your container registry. DOMAIN_SUFFIX is substituted by sed.
|
# La Gaufre integration service — built and pushed by `sunbeam build integration`
|
||||||
- name: sunbeam-proxy
|
- name: integration
|
||||||
newName: src.DOMAIN_SUFFIX/sunbeam/sunbeam-proxy
|
newName: src.DOMAIN_SUFFIX/studio/integration
|
||||||
|
newTag: latest
|
||||||
|
|
||||||
|
# Meet — built from source and pushed to Gitea registry.
|
||||||
|
- name: meet-backend
|
||||||
|
newName: src.DOMAIN_SUFFIX/studio/meet-backend
|
||||||
|
newTag: latest
|
||||||
|
- name: meet-frontend
|
||||||
|
newName: src.DOMAIN_SUFFIX/studio/meet-frontend
|
||||||
newTag: latest
|
newTag: latest
|
||||||
|
|
||||||
patches:
|
patches:
|
||||||
- path: values-pingora.yaml
|
# Pingora host ports — bind :80/:443 to the host network
|
||||||
|
- path: patch-pingora-hostport.yaml
|
||||||
|
|
||||||
# TODO: set OIDC redirect URIs to https://*.sunbeam.pt/...
|
# Production resource limits for 64 GiB server
|
||||||
# - path: values-ory.yaml
|
- path: values-resources.yaml
|
||||||
|
|
||||||
# TODO: set production resource limits (64 GB server)
|
# LiveKit TURN service: ClusterIP (Pingora routes TURN traffic on :443)
|
||||||
# - path: values-resources.yaml
|
- path: patch-livekit-service.yaml
|
||||||
|
|
||||||
|
# CNPG: production sizing (500 Gi, 8 Gi RAM) + barman S3 backup config
|
||||||
|
- path: patch-postgres-production.yaml
|
||||||
|
|
||||||
|
# OpenSearch: expand PVC to 50 Gi
|
||||||
|
- path: patch-opensearch-storage.yaml
|
||||||
|
|
||||||
|
# SeaweedFS volume: expand PVC to 600 Gi
|
||||||
|
- path: patch-seaweedfs-volume-size.yaml
|
||||||
|
|||||||
10
overlays/production/patch-livekit-service.yaml
Normal file
10
overlays/production/patch-livekit-service.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Patch: keep LiveKit TURN service as ClusterIP — Pingora routes external TURN traffic.
|
||||||
|
# Without this patch, klipper-lb (disabled) or the default LoadBalancer type may
|
||||||
|
# conflict with Pingora's host port bindings on port 443.
|
||||||
|
apiVersion: v1
|
||||||
|
kind: Service
|
||||||
|
metadata:
|
||||||
|
name: livekit-server-turn
|
||||||
|
namespace: media
|
||||||
|
spec:
|
||||||
|
type: ClusterIP
|
||||||
10
overlays/production/patch-opensearch-storage.yaml
Normal file
10
overlays/production/patch-opensearch-storage.yaml
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
# Expand OpenSearch PVC to 50 Gi in production.
|
||||||
|
apiVersion: v1
|
||||||
|
kind: PersistentVolumeClaim
|
||||||
|
metadata:
|
||||||
|
name: opensearch-data
|
||||||
|
namespace: data
|
||||||
|
spec:
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 50Gi
|
||||||
25
overlays/production/patch-pingora-hostport.yaml
Normal file
25
overlays/production/patch-pingora-hostport.yaml
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
# Bind Pingora container ports to the host network so external traffic
|
||||||
|
# on ports 80 and 443 reaches the proxy pod directly.
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: pingora
|
||||||
|
namespace: ingress
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: pingora
|
||||||
|
ports:
|
||||||
|
- name: http
|
||||||
|
containerPort: 80
|
||||||
|
hostPort: 80
|
||||||
|
protocol: TCP
|
||||||
|
- name: https
|
||||||
|
containerPort: 443
|
||||||
|
hostPort: 443
|
||||||
|
protocol: TCP
|
||||||
|
- name: ssh
|
||||||
|
containerPort: 22
|
||||||
|
hostPort: 22
|
||||||
|
protocol: TCP
|
||||||
45
overlays/production/patch-postgres-production.yaml
Normal file
45
overlays/production/patch-postgres-production.yaml
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
# Production CNPG cluster sizing for 12-core, 64 GiB Elastic Metal.
|
||||||
|
# Barman backs up WAL + base backups to Scaleway Object Storage (s3://sunbeam-backups/postgres).
|
||||||
|
# barman-s3-creds K8s Secret is synced by VSO from secret/scaleway-s3 in OpenBao.
|
||||||
|
apiVersion: postgresql.cnpg.io/v1
|
||||||
|
kind: Cluster
|
||||||
|
metadata:
|
||||||
|
name: postgres
|
||||||
|
namespace: data
|
||||||
|
spec:
|
||||||
|
instances: 1
|
||||||
|
|
||||||
|
postgresql:
|
||||||
|
parameters:
|
||||||
|
max_connections: "200"
|
||||||
|
shared_buffers: "2GB"
|
||||||
|
effective_cache_size: "6GB"
|
||||||
|
work_mem: "16MB"
|
||||||
|
maintenance_work_mem: "512MB"
|
||||||
|
|
||||||
|
storage:
|
||||||
|
size: 100Gi
|
||||||
|
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 4Gi
|
||||||
|
cpu: "2"
|
||||||
|
limits:
|
||||||
|
memory: 8Gi
|
||||||
|
|
||||||
|
backup:
|
||||||
|
barmanObjectStore:
|
||||||
|
destinationPath: "s3://sunbeam-backups/postgres"
|
||||||
|
endpointURL: "https://s3.fr-par.scw.cloud"
|
||||||
|
s3Credentials:
|
||||||
|
accessKeyId:
|
||||||
|
name: barman-s3-creds
|
||||||
|
key: ACCESS_KEY_ID
|
||||||
|
secretAccessKey:
|
||||||
|
name: barman-s3-creds
|
||||||
|
key: ACCESS_SECRET_KEY
|
||||||
|
wal:
|
||||||
|
compression: gzip
|
||||||
|
data:
|
||||||
|
compression: gzip
|
||||||
|
retentionPolicy: "30d"
|
||||||
15
overlays/production/patch-seaweedfs-volume-size.yaml
Normal file
15
overlays/production/patch-seaweedfs-volume-size.yaml
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
# Expand SeaweedFS volume PVC to 600 Gi in production.
# NOTE(review): StatefulSet volumeClaimTemplates are immutable — this sizes
# PVCs created for new replicas only; an existing PVC must be resized by
# editing the PVC itself (requires a StorageClass with allowVolumeExpansion).
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: StatefulSet
|
||||||
|
metadata:
|
||||||
|
name: seaweedfs-volume
|
||||||
|
namespace: storage
|
||||||
|
spec:
|
||||||
|
volumeClaimTemplates:
|
||||||
|
- metadata:
|
||||||
|
name: data
|
||||||
|
spec:
|
||||||
|
accessModes: [ReadWriteOnce]
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
storage: 600Gi
|
||||||
12
overlays/production/postgres-scheduled-backup.yaml
Normal file
12
overlays/production/postgres-scheduled-backup.yaml
Normal file
@@ -0,0 +1,12 @@
|
|||||||
|
apiVersion: postgresql.cnpg.io/v1
|
||||||
|
kind: ScheduledBackup
|
||||||
|
metadata:
|
||||||
|
name: postgres-daily
|
||||||
|
namespace: data
|
||||||
|
spec:
|
||||||
|
# Daily at 02:00 UTC
|
||||||
|
schedule: "0 2 * * *"
|
||||||
|
backupOwnerReference: self
|
||||||
|
cluster:
|
||||||
|
name: postgres
|
||||||
|
method: barmanObjectStore
|
||||||
293
overlays/production/values-resources.yaml
Normal file
293
overlays/production/values-resources.yaml
Normal file
@@ -0,0 +1,293 @@
|
|||||||
|
# Production resource limits — Scaleway Elastic Metal, 12 cores, 64 GiB RAM.
|
||||||
|
# ~10 GiB reserved for OS + k3s + Linkerd mesh overhead.
|
||||||
|
# Replicas scaled up for production workloads.
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: meet-celery-worker
|
||||||
|
namespace: lasuite
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: meet-celery-worker
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 256Mi
|
||||||
|
cpu: 100m
|
||||||
|
limits:
|
||||||
|
memory: 1Gi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: cloudnative-pg
|
||||||
|
namespace: data
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: manager
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 256Mi
|
||||||
|
cpu: 100m
|
||||||
|
limits:
|
||||||
|
memory: 512Mi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: livekit-server
|
||||||
|
namespace: media
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: livekit-server
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 512Mi
|
||||||
|
cpu: 500m
|
||||||
|
limits:
|
||||||
|
memory: 2Gi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: pingora
|
||||||
|
namespace: ingress
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: pingora
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 128Mi
|
||||||
|
cpu: 250m
|
||||||
|
limits:
|
||||||
|
memory: 512Mi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: valkey
|
||||||
|
namespace: data
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: valkey
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 128Mi
|
||||||
|
cpu: 50m
|
||||||
|
limits:
|
||||||
|
memory: 512Mi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: opensearch
|
||||||
|
namespace: data
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: opensearch
|
||||||
|
env:
|
||||||
|
- name: OPENSEARCH_JAVA_OPTS
|
||||||
|
value: "-Xms2g -Xmx4g"
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 2Gi
|
||||||
|
cpu: 500m
|
||||||
|
limits:
|
||||||
|
memory: 5Gi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: seaweedfs-filer
|
||||||
|
namespace: storage
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: filer
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 256Mi
|
||||||
|
cpu: 100m
|
||||||
|
limits:
|
||||||
|
memory: 1Gi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: hydra-hydra-maester
|
||||||
|
namespace: ory
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: hydra-maester
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 32Mi
|
||||||
|
cpu: 25m
|
||||||
|
limits:
|
||||||
|
memory: 128Mi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: login-ui
|
||||||
|
namespace: ory
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: login-ui
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 128Mi
|
||||||
|
cpu: 50m
|
||||||
|
limits:
|
||||||
|
memory: 384Mi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: hive
|
||||||
|
namespace: lasuite
|
||||||
|
spec:
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: hive
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 64Mi
|
||||||
|
limits:
|
||||||
|
memory: 256Mi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: people-backend
|
||||||
|
namespace: lasuite
|
||||||
|
spec:
|
||||||
|
replicas: 2
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: people-celery-worker
|
||||||
|
namespace: lasuite
|
||||||
|
spec:
|
||||||
|
replicas: 2
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: people-frontend
|
||||||
|
namespace: lasuite
|
||||||
|
spec:
|
||||||
|
replicas: 2
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: docs-celery-worker
|
||||||
|
namespace: lasuite
|
||||||
|
spec:
|
||||||
|
replicas: 2
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: docs
|
||||||
|
env:
|
||||||
|
- name: CELERY_WORKER_CONCURRENCY
|
||||||
|
value: "4"
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 512Mi
|
||||||
|
cpu: 250m
|
||||||
|
limits:
|
||||||
|
memory: 1Gi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: docs-backend
|
||||||
|
namespace: lasuite
|
||||||
|
spec:
|
||||||
|
replicas: 2
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: docs
|
||||||
|
env:
|
||||||
|
- name: WEB_CONCURRENCY
|
||||||
|
value: "4"
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 512Mi
|
||||||
|
cpu: 250m
|
||||||
|
limits:
|
||||||
|
memory: 1Gi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: docs-frontend
|
||||||
|
namespace: lasuite
|
||||||
|
spec:
|
||||||
|
replicas: 2
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: docs
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 64Mi
|
||||||
|
limits:
|
||||||
|
memory: 256Mi
|
||||||
|
|
||||||
|
---
|
||||||
|
apiVersion: apps/v1
|
||||||
|
kind: Deployment
|
||||||
|
metadata:
|
||||||
|
name: docs-y-provider
|
||||||
|
namespace: lasuite
|
||||||
|
spec:
|
||||||
|
replicas: 1
|
||||||
|
template:
|
||||||
|
spec:
|
||||||
|
containers:
|
||||||
|
- name: docs
|
||||||
|
resources:
|
||||||
|
requests:
|
||||||
|
memory: 256Mi
|
||||||
|
cpu: 100m
|
||||||
|
limits:
|
||||||
|
memory: 1Gi
|
||||||
Reference in New Issue
Block a user