feat(data): upgrade OpenSearch to v3 with ML Commons for neural search
- Upgrade from OpenSearch 2 to 3 (required for ML Commons pre-trained models)
- Rename PLUGINS_SECURITY_DISABLED → DISABLE_SECURITY_PLUGIN (OS3 change)
- Enable ML Commons plugin settings for on-data-node inference
- Increase memory limits (2Gi) and JVM heap for neural model inference
- Add fsGroup security context for volume permissions
This commit is contained in:
@@ -13,6 +13,8 @@ spec:
|
||||
labels:
|
||||
app: opensearch
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 1000
|
||||
initContainers:
|
||||
- name: sysctl
|
||||
image: busybox
|
||||
@@ -21,7 +23,7 @@ spec:
|
||||
privileged: true
|
||||
containers:
|
||||
- name: opensearch
|
||||
image: opensearchproject/opensearch:2
|
||||
image: opensearchproject/opensearch:3
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 9200
|
||||
@@ -33,14 +35,22 @@ spec:
|
||||
- name: discovery.type
|
||||
value: single-node
|
||||
- name: OPENSEARCH_JAVA_OPTS
|
||||
value: "-Xms512m -Xmx1g"
|
||||
value: "-Xms1g -Xmx1536m"
|
||||
- name: DISABLE_SECURITY_PLUGIN
|
||||
value: "true"
|
||||
- name: plugins.ml_commons.only_run_on_ml_node
|
||||
value: "false"
|
||||
- name: plugins.ml_commons.native_memory_threshold
|
||||
value: "90"
|
||||
- name: plugins.ml_commons.model_access_control_enabled
|
||||
value: "false"
|
||||
- name: plugins.ml_commons.allow_registering_model_via_url
|
||||
value: "true"
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Mi
|
||||
memory: 2Gi
|
||||
requests:
|
||||
memory: 768Mi
|
||||
memory: 1Gi
|
||||
cpu: 100m
|
||||
volumeMounts:
|
||||
- name: data
|
||||
|
||||
150
base/data/opensearch-ml-init-job.yaml
Normal file
150
base/data/opensearch-ml-init-job.yaml
Normal file
@@ -0,0 +1,150 @@
|
||||
# OpenSearch ML init Job: registers and deploys the BGE-large-en-v1.5 embedding
# model via ML Commons, then creates the tuwunel ingest and hybrid-search
# pipelines. Idempotent: re-runs look the model up by name and only repair what
# is missing, so restartPolicy: OnFailure can safely retry after any failure.
apiVersion: batch/v1
kind: Job
metadata:
  name: opensearch-ml-init
  namespace: data
  annotations:
    # NOTE(review): helm.sh/hook only takes effect when this manifest is
    # rendered by Helm; the base/ path suggests kustomize — confirm the hook
    # is actually honored in this deployment pipeline.
    helm.sh/hook: post-install
spec:
  backoffLimit: 6 # bound pod retries explicitly; each retry is safe (script is idempotent)
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: init
          image: curlimages/curl:latest
          command:
            - /bin/sh
            - -c
            - |
              set -e
              OS=http://opensearch.data.svc.cluster.local:9200

              echo "Waiting for OpenSearch..."
              until curl -sf "$OS/_cluster/health" >/dev/null 2>&1; do
                sleep 5
              done
              echo "OpenSearch is ready."

              # --- Idempotent: check if model already exists before registering ---
              # JSON responses are scraped with sed because curlimages/curl ships
              # no jq; brittle if field ordering changes, but adequate here.

              # Check for existing model by name. If the ML indices don't exist
              # yet, curl -f fails on the 404 but the pipeline still exits 0
              # (exit status is sed's), leaving EXISTING_MODEL_ID empty and
              # taking the fresh-registration path below.
              EXISTING_MODEL_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_search" \
                -H 'Content-Type: application/json' \
                -d '{"query":{"bool":{"must":[{"term":{"name":"bge-large-en-v1.5"}}]}}}' \
                | sed -n 's/.*"_id":"\([^"]*\)".*/\1/p' | head -1)

              if [ -n "$EXISTING_MODEL_ID" ]; then
                echo "Model already registered: $EXISTING_MODEL_ID"
                MODEL_ID="$EXISTING_MODEL_ID"

                # Ensure it's deployed
                STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                  | sed -n 's/.*"model_state":"\([^"]*\)".*/\1/p')
                if [ "$STATE" != "DEPLOYED" ]; then
                  echo "Model not deployed (state=$STATE), deploying..."
                  curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy" || true
                  for i in $(seq 1 30); do
                    STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                      | sed -n 's/.*"model_state":"\([^"]*\)".*/\1/p')
                    echo "  state: $STATE"
                    if [ "$STATE" = "DEPLOYED" ]; then break; fi
                    sleep 5
                  done
                  # FIX: previously fell through silently on timeout; fail so
                  # the Job retries instead of succeeding with an unusable model.
                  if [ "$STATE" != "DEPLOYED" ]; then
                    echo "Timed out waiting for model deployment (state=$STATE)" >&2
                    exit 1
                  fi
                fi
              else
                # 1. Create model group (idempotent — OpenSearch deduplicates by name)
                echo "Creating model group..."
                GROUP_ID=$(curl -sf -X POST "$OS/_plugins/_ml/model_groups/_register" \
                  -H 'Content-Type: application/json' \
                  -d '{"name":"sunbeam-embeddings","description":"Sunbeam embedding models"}' \
                  | sed -n 's/.*"model_group_id":"\([^"]*\)".*/\1/p')
                echo "Model group: $GROUP_ID"
                # FIX: an empty GROUP_ID would silently produce a broken
                # registration payload below — fail fast instead.
                if [ -z "$GROUP_ID" ]; then
                  echo "Failed to create model group" >&2
                  exit 1
                fi

                # 2. Register BGE-large-en-v1.5 model from SeaweedFS bucket
                echo "Registering BGE-large-en-v1.5 model..."
                TASK_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_register" \
                  -H 'Content-Type: application/json' \
                  -d "{
                    \"name\": \"bge-large-en-v1.5\",
                    \"version\": \"1.0.0\",
                    \"model_format\": \"ONNX\",
                    \"model_group_id\": \"$GROUP_ID\",
                    \"model_config\": {
                      \"model_type\": \"bert\",
                      \"embedding_dimension\": 1024,
                      \"framework_type\": \"sentence_transformers\",
                      \"all_config\": \"{\\\"_name_or_path\\\":\\\"BAAI/bge-large-en-v1.5\\\",\\\"model_type\\\":\\\"bert\\\"}\"
                    },
                    \"url\": \"http://seaweedfs-filer.storage.svc.cluster.local:8333/sunbeam-ml-models/bge-large-en-v1.5.zip\"
                  }" \
                  | sed -n 's/.*"task_id":"\([^"]*\)".*/\1/p')
                echo "Registration task: $TASK_ID"
                # FIX: without a task id the polling loop below spins on a
                # malformed URL — fail fast instead.
                if [ -z "$TASK_ID" ]; then
                  echo "Model registration request returned no task_id" >&2
                  exit 1
                fi

                # Wait for model registration (model download can take minutes)
                echo "Waiting for model registration..."
                for i in $(seq 1 60); do
                  STATUS=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" \
                    | sed -n 's/.*"state":"\([^"]*\)".*/\1/p')
                  echo "  status: $STATUS"
                  if [ "$STATUS" = "COMPLETED" ]; then break; fi
                  if [ "$STATUS" = "FAILED" ]; then
                    echo "Model registration failed!"
                    curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID"
                    exit 1
                  fi
                  sleep 10
                done
                # FIX: on timeout the old script fell through with an empty
                # MODEL_ID and created an ingest pipeline with "model_id": "".
                if [ "$STATUS" != "COMPLETED" ]; then
                  echo "Timed out waiting for model registration (status=$STATUS)" >&2
                  exit 1
                fi

                MODEL_ID=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" \
                  | sed -n 's/.*"model_id":"\([^"]*\)".*/\1/p')
                echo "Model ID: $MODEL_ID"
                if [ -z "$MODEL_ID" ]; then
                  echo "Completed registration task has no model_id" >&2
                  exit 1
                fi

                # 3. Deploy the model
                echo "Deploying model..."
                curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy"

                echo "Waiting for model deployment..."
                for i in $(seq 1 30); do
                  STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                    | sed -n 's/.*"model_state":"\([^"]*\)".*/\1/p')
                  echo "  state: $STATE"
                  if [ "$STATE" = "DEPLOYED" ]; then break; fi
                  sleep 5
                done
                # FIX: treat a deploy timeout as failure; a retried run will
                # find the registered model and only re-attempt deployment.
                if [ "$STATE" != "DEPLOYED" ]; then
                  echo "Timed out waiting for model deployment (state=$STATE)" >&2
                  exit 1
                fi
              fi

              # 4. Create/update ingest pipeline (PUT is idempotent)
              echo "Creating ingest pipeline..."
              curl -sf -X PUT "$OS/_ingest/pipeline/tuwunel_embedding_pipeline" \
                -H 'Content-Type: application/json' \
                -d "{
                  \"description\": \"Tuwunel message embedding pipeline\",
                  \"processors\": [{
                    \"text_embedding\": {
                      \"model_id\": \"$MODEL_ID\",
                      \"field_map\": {
                        \"body\": \"embedding\"
                      }
                    }
                  }]
                }"

              # 5. Create/update search pipeline (PUT is idempotent)
              echo "Creating search pipeline..."
              curl -sf -X PUT "$OS/_search/pipeline/tuwunel_hybrid_pipeline" \
                -H 'Content-Type: application/json' \
                -d '{
                  "description": "Tuwunel hybrid BM25+neural search pipeline",
                  "phase_results_processors": [{
                    "normalization-processor": {
                      "normalization": { "technique": "min_max" },
                      "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": { "weights": [0.3, 0.7] }
                      }
                    }
                  }]
                }'

              echo "OpenSearch ML init complete. Model ID: $MODEL_ID"
              echo "Update tuwunel config search_opensearch_model_id with: $MODEL_ID"
|
||||
Reference in New Issue
Block a user