# sbbb/base/data/opensearch-ml-init-job.yaml

# One-shot Job that bootstraps OpenSearch ML for Tuwunel search:
#   1. waits for the cluster to answer on /_cluster/health,
#   2. registers (or reuses) the BGE-large-en-v1.5 embedding model and deploys it,
#   3. creates/updates the ingest pipeline and the hybrid search pipeline.
# The script is written to be safely re-runnable: existing model groups and
# models are looked up by name before anything is registered, and every wait
# loop fails the Job explicitly instead of continuing with an empty ID.
apiVersion: batch/v1
kind: Job
metadata:
  name: opensearch-ml-init
  namespace: data
  annotations:
    helm.sh/hook: post-install
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: init
          # NOTE(review): `latest` is a floating tag; consider pinning a
          # specific curlimages/curl version for reproducible runs.
          image: curlimages/curl:latest
          command:
            - /bin/sh
            - -c
            - |
              set -e
              OS=http://opensearch.data.svc.cluster.local:9200
              MODEL_NAME=bge-large-en-v1.5
              GROUP_NAME=sunbeam-embeddings

              # Print an error to stderr and fail the Job.
              die() {
                echo "ERROR: $*" >&2
                exit 1
              }

              # json_first KEY: read compact JSON on stdin and print the string
              # value of the FIRST occurrence of "KEY":"...". Prints nothing
              # (and still returns 0) when the key is absent or the input is
              # empty, so callers must check the result for emptiness.
              json_first() {
                grep -o "\"$1\":\"[^\"]*\"" | head -n 1 | sed 's/^"[^"]*":"\(.*\)"$/\1/'
              }

              # name_query NAME: emit a search body matching documents whose
              # name equals NAME. Both `name` and `name.keyword` are tried so
              # the lookup works whether the field is mapped as keyword or as
              # analysed text with a keyword sub-field (assumption: mapping not
              # visible from here — confirm against the cluster). Documents
              # carrying `chunk_number` (model chunks) are excluded.
              name_query() {
                printf '{"size":1,"query":{"bool":{"should":[{"term":{"name":"%s"}},{"term":{"name.keyword":"%s"}}],"minimum_should_match":1,"must_not":[{"exists":{"field":"chunk_number"}}]}}}' "$1" "$1"
              }

              # model_state MODEL_ID: print the model's current model_state.
              model_state() {
                curl -sf "$OS/_plugins/_ml/models/$1" | json_first model_state
              }

              # wait_deployed MODEL_ID: poll every 5s (30 tries) until the
              # model reports DEPLOYED. Returns 1 on timeout.
              wait_deployed() {
                for i in $(seq 1 30); do
                  STATE=$(model_state "$1")
                  echo "  state: $STATE"
                  if [ "$STATE" = "DEPLOYED" ]; then return 0; fi
                  sleep 5
                done
                return 1
              }

              echo "Waiting for OpenSearch..."
              until curl -sf "$OS/_cluster/health" >/dev/null 2>&1; do
                sleep 5
              done
              echo "OpenSearch is ready."

              # --- Idempotent: check if model already exists before registering ---

              # Look up an already-registered model by name. A failed search
              # (e.g. the ML index does not exist yet) yields an empty ID and
              # falls through to registration.
              EXISTING_MODEL_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_search" \
                -H 'Content-Type: application/json' \
                -d "$(name_query "$MODEL_NAME")" \
                | json_first _id)

              if [ -n "$EXISTING_MODEL_ID" ]; then
                echo "Model already registered: $EXISTING_MODEL_ID"
                MODEL_ID="$EXISTING_MODEL_ID"

                # Ensure it's deployed
                STATE=$(model_state "$MODEL_ID")
                if [ "$STATE" != "DEPLOYED" ]; then
                  echo "Model not deployed (state=$STATE), deploying..."
                  # Best effort: a deploy may already be in flight; the wait
                  # below decides whether the Job succeeds.
                  curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy" || true
                  wait_deployed "$MODEL_ID" \
                    || die "Model $MODEL_ID did not reach DEPLOYED (last state: $STATE)"
                fi
              else
                # 1. Find or create the model group. Registering a group whose
                #    name is already taken is rejected by OpenSearch, so reuse
                #    an existing group when a previous run got this far.
                echo "Looking up model group..."
                GROUP_ID=$(curl -sf -X POST "$OS/_plugins/_ml/model_groups/_search" \
                  -H 'Content-Type: application/json' \
                  -d "$(name_query "$GROUP_NAME")" \
                  | json_first _id)
                if [ -z "$GROUP_ID" ]; then
                  echo "Creating model group..."
                  GROUP_ID=$(curl -sf -X POST "$OS/_plugins/_ml/model_groups/_register" \
                    -H 'Content-Type: application/json' \
                    -d "{\"name\":\"$GROUP_NAME\",\"description\":\"Sunbeam embedding models\"}" \
                    | json_first model_group_id)
                fi
                [ -n "$GROUP_ID" ] || die "Could not find or create model group '$GROUP_NAME'"
                echo "Model group: $GROUP_ID"

                # 2. Register BGE-large-en-v1.5 model from SeaweedFS bucket
                echo "Registering BGE-large-en-v1.5 model..."
                TASK_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_register" \
                  -H 'Content-Type: application/json' \
                  -d "{
                    \"name\": \"$MODEL_NAME\",
                    \"version\": \"1.0.0\",
                    \"model_format\": \"ONNX\",
                    \"model_group_id\": \"$GROUP_ID\",
                    \"model_config\": {
                      \"model_type\": \"bert\",
                      \"embedding_dimension\": 1024,
                      \"framework_type\": \"sentence_transformers\",
                      \"all_config\": \"{\\\"_name_or_path\\\":\\\"BAAI/bge-large-en-v1.5\\\",\\\"model_type\\\":\\\"bert\\\"}\"
                    },
                    \"url\": \"http://seaweedfs-filer.storage.svc.cluster.local:8333/sunbeam-ml-models/bge-large-en-v1.5.zip\"
                  }" \
                  | json_first task_id)
                [ -n "$TASK_ID" ] || die "Model registration request failed (no task_id returned)"
                echo "Registration task: $TASK_ID"

                # Wait for model registration (up to 60 x 10s).
                echo "Waiting for model registration..."
                STATUS=""
                for i in $(seq 1 60); do
                  STATUS=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" | json_first state)
                  echo "  status: $STATUS"
                  if [ "$STATUS" = "COMPLETED" ]; then break; fi
                  if [ "$STATUS" = "FAILED" ]; then
                    echo "Model registration failed!"
                    # Dump the task for diagnostics; never let this mask the failure.
                    curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" || true
                    echo
                    exit 1
                  fi
                  sleep 10
                done
                # Falling out of the loop without COMPLETED means we timed out.
                [ "$STATUS" = "COMPLETED" ] \
                  || die "Timed out waiting for model registration (last status: $STATUS)"

                MODEL_ID=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" | json_first model_id)
                [ -n "$MODEL_ID" ] || die "Registration task $TASK_ID completed without a model_id"
                echo "Model ID: $MODEL_ID"

                # 3. Deploy the model
                echo "Deploying model..."
                curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy"

                echo "Waiting for model deployment..."
                wait_deployed "$MODEL_ID" \
                  || die "Model $MODEL_ID did not reach DEPLOYED (last state: $STATE)"
              fi

              # Never write a pipeline that points at an empty model ID.
              [ -n "$MODEL_ID" ] || die "No model ID available; refusing to create pipelines"

              # 4. Create/update ingest pipeline (PUT is idempotent)
              echo "Creating ingest pipeline..."
              curl -sf -X PUT "$OS/_ingest/pipeline/tuwunel_embedding_pipeline" \
                -H 'Content-Type: application/json' \
                -d "{
                  \"description\": \"Tuwunel message embedding pipeline\",
                  \"processors\": [{
                    \"text_embedding\": {
                      \"model_id\": \"$MODEL_ID\",
                      \"field_map\": {
                        \"body\": \"embedding\"
                      }
                    }
                  }]
                }"

              # 5. Create/update search pipeline (PUT is idempotent)
              echo "Creating search pipeline..."
              curl -sf -X PUT "$OS/_search/pipeline/tuwunel_hybrid_pipeline" \
                -H 'Content-Type: application/json' \
                -d '{
                  "description": "Tuwunel hybrid BM25+neural search pipeline",
                  "phase_results_processors": [{
                    "normalization-processor": {
                      "normalization": { "technique": "min_max" },
                      "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": { "weights": [0.3, 0.7] }
                      }
                    }
                  }]
                }'

              echo "OpenSearch ML init complete. Model ID: $MODEL_ID"
              echo "Update tuwunel config search_opensearch_model_id with: $MODEL_ID"