feat(infra): production bootstrap — cert-manager, longhorn, monitoring
Add new bases for cert-manager (Let's Encrypt + wildcard cert), Longhorn distributed storage, and monitoring (kube-prometheus-stack + Loki + Tempo + Grafana OIDC). Add cloud-init for Scaleway Elastic Metal provisioning. Production overlay: add patches for postgres sizing, SeaweedFS volume, OpenSearch storage, LiveKit service, Pingora host ports, resource limits, and CNPG daily barman backups. Update cert-manager.yaml with full dnsNames for all *.sunbeam.pt subdomains.
This commit is contained in:
15
base/cert-manager/kustomization.yaml
Normal file
15
base/cert-manager/kustomization.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - namespace.yaml

# Inflated via `kustomize build --enable-helm`.
# helm repo add jetstack https://charts.jetstack.io
helmCharts:
  - name: cert-manager
    repo: https://charts.jetstack.io
    version: "1.19.4"
    releaseName: cert-manager
    namespace: cert-manager
    valuesFile: values.yaml
    includeCRDs: true
|
||||
4
base/cert-manager/namespace.yaml
Normal file
4
base/cert-manager/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
---
apiVersion: v1
kind: Namespace
metadata:
  name: cert-manager
|
||||
2
base/cert-manager/values.yaml
Normal file
2
base/cert-manager/values.yaml
Normal file
@@ -0,0 +1,2 @@
|
||||
# Install the cert-manager CRDs as part of the chart release.
crds:
  enabled: true
|
||||
13
base/longhorn/kustomization.yaml
Normal file
13
base/longhorn/kustomization.yaml
Normal file
@@ -0,0 +1,13 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

resources:
  - namespace.yaml

# helm repo add longhorn https://charts.longhorn.io
helmCharts:
  - name: longhorn
    repo: https://charts.longhorn.io
    version: "1.11.0"
    releaseName: longhorn
    namespace: longhorn-system
    valuesFile: values.yaml
|
||||
4
base/longhorn/namespace.yaml
Normal file
4
base/longhorn/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
---
apiVersion: v1
kind: Namespace
metadata:
  name: longhorn-system
|
||||
24
base/longhorn/values.yaml
Normal file
24
base/longhorn/values.yaml
Normal file
@@ -0,0 +1,24 @@
|
||||
# Longhorn distributed block storage.
# Single-node production deployment — hardware RAID1 already provides redundancy.

defaultSettings:
  # One replica only: RAID1 mirrors the disk, so software replication on a
  # single node adds no durability and would halve the usable capacity.
  defaultReplicaCount: 1

  # Automatically clean up orphaned node/instance resources.
  orphanAutoDeletion: true

  # Permit volume scheduling on the sole node even under disk pressure —
  # there is no other node to reschedule to.
  allowVolumeCreationWithDegradedAvailability: true

  # Reserve only 10% instead of the 30% default: physical redundancy comes
  # from RAID1, so Longhorn need not hold back space for software replicas.
  # With a 937 GiB disk: 843 GiB schedulable (600 SW + 100 PG + ~143 headroom).
  storageReservedPercentageForDefaultDisk: 10

# Make Longhorn the cluster's default StorageClass.
persistence:
  defaultClass: true
  defaultClassReplicaCount: 1
|
||||
32
base/monitoring/grafana-oauth2client.yaml
Normal file
32
base/monitoring/grafana-oauth2client.yaml
Normal file
@@ -0,0 +1,32 @@
|
||||
# Hydra OAuth2Client for Grafana OIDC sign-in.
#
# Hydra Maester watches this CRD and:
#   1. Registers the client with Hydra.
#   2. Creates the K8s Secret "grafana-oidc" in the monitoring namespace
#      with CLIENT_ID and CLIENT_SECRET keys.
#
# Grafana consumes that secret via envFromSecret and interpolates
# ${CLIENT_ID} / ${CLIENT_SECRET} in grafana.ini at startup.
#
# DOMAIN_SUFFIX is substituted by sunbeam apply.
---
apiVersion: hydra.ory.sh/v1alpha1
kind: OAuth2Client
metadata:
  name: grafana
  namespace: monitoring
spec:
  clientName: Grafana
  grantTypes:
    - authorization_code
    - refresh_token
  responseTypes:
    - code
  scope: openid email profile
  redirectUris:
    - https://grafana.DOMAIN_SUFFIX/login/generic_oauth
  postLogoutRedirectUris:
    - https://grafana.DOMAIN_SUFFIX/
  tokenEndpointAuthMethod: client_secret_post
  secretName: grafana-oidc
  skipConsent: true
|
||||
34
base/monitoring/kustomization.yaml
Normal file
34
base/monitoring/kustomization.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization

namespace: monitoring

resources:
  - namespace.yaml
  - vault-secrets.yaml
  - grafana-oauth2client.yaml

helmCharts:
  # helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
  - name: kube-prometheus-stack
    repo: https://prometheus-community.github.io/helm-charts
    version: "82.9.0"
    releaseName: kube-prometheus-stack
    namespace: monitoring
    valuesFile: prometheus-values.yaml
    includeCRDs: true

  # helm repo add grafana https://grafana.github.io/helm-charts
  - name: loki
    repo: https://grafana.github.io/helm-charts
    version: "6.53.0"
    releaseName: loki
    namespace: monitoring
    valuesFile: loki-values.yaml

  - name: tempo
    repo: https://grafana.github.io/helm-charts
    version: "1.24.4"
    releaseName: tempo
    namespace: monitoring
    valuesFile: tempo-values.yaml
|
||||
43
base/monitoring/loki-values.yaml
Normal file
43
base/monitoring/loki-values.yaml
Normal file
@@ -0,0 +1,43 @@
|
||||
# Loki — monolithic single-binary mode, filesystem storage, single tenant.
deploymentMode: SingleBinary

loki:
  auth_enabled: false
  commonConfig:
    replication_factor: 1
  storage:
    type: filesystem
  schemaConfig:
    configs:
      # Quoted so the date parses as a string, not a YAML timestamp.
      - from: "2024-01-01"
        store: tsdb
        object_store: filesystem
        schema: v13
        index:
          prefix: index_
          period: 24h

singleBinary:
  replicas: 1
  persistence:
    enabled: true
    size: 30Gi

# Zero out the scalable-mode targets — SingleBinary runs everything in one pod.
backend:
  replicas: 0
read:
  replicas: 0
write:
  replicas: 0

# Disable self-monitoring / canary / test sub-charts not needed on a single node.
# NOTE(review): nesting reconstructed per the loki 6.x chart value layout —
# confirm grafanaAgent sits under monitoring.selfMonitoring and lokiCanary/test
# are top-level keys.
monitoring:
  selfMonitoring:
    enabled: false
    grafanaAgent:
      installOperator: false

lokiCanary:
  enabled: false

test:
  enabled: false
|
||||
4
base/monitoring/namespace.yaml
Normal file
4
base/monitoring/namespace.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
---
apiVersion: v1
kind: Namespace
metadata:
  name: monitoring
|
||||
96
base/monitoring/prometheus-values.yaml
Normal file
96
base/monitoring/prometheus-values.yaml
Normal file
@@ -0,0 +1,96 @@
|
||||
# kube-prometheus-stack — Prometheus + AlertManager + Grafana + node-exporter + kube-state-metrics
#
# k3s quirks: kube-proxy is replaced by Cilium; etcd/scheduler/controller-manager
# don't expose metrics on standard ports. Disable their monitors to avoid noise.

grafana:
  adminUser: admin
  admin:
    existingSecret: grafana-admin
    passwordKey: admin-password
  persistence:
    enabled: true
    size: 2Gi
  # Inject Hydra OIDC client credentials (created by Hydra Maester from the
  # OAuth2Client CRD in this kustomization).
  envFromSecret: grafana-oidc
  grafana.ini:
    server:
      root_url: "https://grafana.DOMAIN_SUFFIX"
    auth:
      # Keep local login as a fallback (admin password from the grafana-admin secret).
      disable_login_form: false
      signout_redirect_url: "https://auth.DOMAIN_SUFFIX/oauth2/sessions/logout"
    auth.generic_oauth:
      enabled: true
      name: Sunbeam
      icon: signin
      # CLIENT_ID / CLIENT_SECRET come from the grafana-oidc Secret via envFromSecret.
      client_id: "${CLIENT_ID}"
      client_secret: "${CLIENT_SECRET}"
      scopes: "openid email profile"
      auth_url: "https://auth.DOMAIN_SUFFIX/oauth2/auth"
      token_url: "https://auth.DOMAIN_SUFFIX/oauth2/token"
      api_url: "https://auth.DOMAIN_SUFFIX/userinfo"
      allow_sign_up: true
      # Small studio — anyone with a valid La Suite account is an admin.
      # To restrict to specific users, set role_attribute_path instead.
      auto_assign_org_role: Admin
      skip_org_role_sync: true
  additionalDataSources:
    - name: Loki
      type: loki
      url: http://loki.monitoring.svc.cluster.local:3100
      access: proxy
      isDefault: false
    - name: Tempo
      type: tempo
      url: http://tempo.monitoring.svc.cluster.local:3100
      access: proxy
      isDefault: false

prometheus:
  prometheusSpec:
    retention: 90d
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: [ReadWriteOnce]
          resources:
            requests:
              storage: 30Gi

alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          accessModes: [ReadWriteOnce]
          resources:
            requests:
              storage: 2Gi
  config:
    global:
      smtp_from: "alerts@DOMAIN_SUFFIX"
      smtp_smarthost: "postfix.lasuite.svc.cluster.local:25"
      smtp_require_tls: false
    route:
      group_by: [alertname, namespace]
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 12h
      receiver: email
    receivers:
      - name: email
        email_configs:
          - to: "ops@DOMAIN_SUFFIX"
            send_resolved: true

# Components k3s does not expose — disable their ServiceMonitors.
kubeEtcd:
  enabled: false
kubeControllerManager:
  enabled: false
kubeScheduler:
  enabled: false
kubeProxy:
  enabled: false
|
||||
26
base/monitoring/tempo-values.yaml
Normal file
26
base/monitoring/tempo-values.yaml
Normal file
@@ -0,0 +1,26 @@
|
||||
# Tempo — monolithic single-binary, local filesystem backend.
# Receives OTLP over gRPC (:4317) and HTTP (:4318).
tempo:
  reportingEnabled: false
  receivers:
    otlp:
      protocols:
        grpc:
          # Quoted: "0.0.0.0:4317" contains ':' and must stay a string.
          endpoint: "0.0.0.0:4317"
        http:
          endpoint: "0.0.0.0:4318"
  storage:
    trace:
      backend: local
      local:
        path: /var/tempo/traces
      wal:
        path: /var/tempo/wal

persistence:
  enabled: true
  size: 20Gi

# Expose the OTLP ports via a ClusterIP service.
service:
  type: ClusterIP
|
||||
36
base/monitoring/vault-secrets.yaml
Normal file
36
base/monitoring/vault-secrets.yaml
Normal file
@@ -0,0 +1,36 @@
|
||||
---
# Vault Secrets Operator auth binding for the monitoring namespace.
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultAuth
metadata:
  name: vso-auth
  namespace: monitoring
spec:
  method: kubernetes
  mount: kubernetes
  kubernetes:
    role: vso
    serviceAccount: default
---
# Grafana admin password from OpenBao KV at secret/grafana.
apiVersion: secrets.hashicorp.com/v1beta1
kind: VaultStaticSecret
metadata:
  name: grafana-admin
  namespace: monitoring
spec:
  vaultAuthRef: vso-auth
  mount: secret
  type: kv-v2
  path: grafana
  refreshAfter: 30s
  destination:
    name: grafana-admin
    create: true
    overwrite: true
    transformation:
      excludeRaw: true
      templates:
        admin-password:
          text: "{{ index .Secrets \"admin-password\" }}"
        admin-user:
          text: "admin"
|
||||
Reference in New Issue
Block a user