# Helm post-install hook Job that bootstraps OpenSearch ML for Tuwunel search.
#
# The init script:
#   1. waits for the OpenSearch cluster health endpoint to answer,
#   2. reuses an already-registered "bge-large-en-v1.5" model if one exists,
#      otherwise creates/reuses the "sunbeam-embeddings" model group and
#      registers the model from the SeaweedFS bucket,
#   3. deploys the model and waits until it reports DEPLOYED,
#   4. PUTs the ingest pipeline (text_embedding: body -> embedding),
#   5. PUTs the hybrid BM25+neural search pipeline.
#
# Any step that cannot obtain the ID it needs, or that times out, exits
# non-zero so that restartPolicy: OnFailure re-runs the script instead of
# silently creating pipelines that point at a missing or undeployed model.
apiVersion: batch/v1
kind: Job
metadata:
  name: opensearch-ml-init
  namespace: data
  annotations:
    helm.sh/hook: post-install
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: init
          # NOTE(review): ":latest" is a floating tag; consider pinning a
          # specific curl image version so re-runs are reproducible.
          image: curlimages/curl:latest
          command:
            - /bin/sh
            - -c
            - |
              set -e
              OS=http://opensearch.data.svc.cluster.local:9200

              # json_field KEY
              # Reads a single-line JSON document on stdin and prints the value
              # of the last "KEY":"value" string pair (the sed match is greedy).
              # Prints nothing when the key is absent or the input is empty.
              json_field() {
                sed -n 's/.*"'"$1"'":"\([^"]*\)".*/\1/p' | head -n 1
              }

              # die MESSAGE...
              # Prints MESSAGE to stderr and terminates the script with status 1.
              die() {
                echo "ERROR: $*" >&2
                exit 1
              }

              # wait_for_deploy
              # Polls the model identified by $MODEL_ID every 5s (30 attempts).
              # Returns 0 once model_state is DEPLOYED; returns 1 on an explicit
              # deploy failure or when the attempts are exhausted.
              wait_for_deploy() {
                for i in $(seq 1 30); do
                  STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                    | json_field model_state)
                  echo "  state: $STATE"
                  if [ "$STATE" = "DEPLOYED" ]; then return 0; fi
                  if [ "$STATE" = "DEPLOY_FAILED" ]; then
                    echo "ERROR: model deployment failed" >&2
                    return 1
                  fi
                  sleep 5
                done
                echo "ERROR: timed out waiting for deployment (last state: $STATE)" >&2
                return 1
              }

              echo "Waiting for OpenSearch..."
              until curl -sf "$OS/_cluster/health" >/dev/null 2>&1; do
                sleep 5
              done
              echo "OpenSearch is ready."

              # --- Idempotent: check if model already exists before registering ---
              # Exact-name lookup uses the keyword sub-field (a term query on the
              # analyzed "name" text field does not match hyphenated names), skips
              # model chunk documents, and asks for a single hit so that only one
              # "_id" appears in the response.
              EXISTING_MODEL_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_search" \
                -H 'Content-Type: application/json' \
                -d '{"size":1,"query":{"bool":{"must":[{"term":{"name.keyword":"bge-large-en-v1.5"}}],"must_not":[{"exists":{"field":"chunk_number"}}]}}}' \
                | json_field _id)

              if [ -n "$EXISTING_MODEL_ID" ]; then
                echo "Model already registered: $EXISTING_MODEL_ID"
                MODEL_ID="$EXISTING_MODEL_ID"

                # Ensure it's deployed
                STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                  | json_field model_state)
                if [ "$STATE" != "DEPLOYED" ]; then
                  echo "Model not deployed (state=$STATE), deploying..."
                  # Best effort: the request may be rejected if a deploy is
                  # already in flight; wait_for_deploy decides success.
                  curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy" || true
                  wait_for_deploy
                fi
              else
                # 1. Find or create the model group. Registering a group whose
                #    name is already taken is rejected, so look it up first to
                #    keep re-runs after a partial failure working.
                echo "Creating model group..."
                GROUP_ID=$(curl -sf -X POST "$OS/_plugins/_ml/model_groups/_search" \
                  -H 'Content-Type: application/json' \
                  -d '{"size":1,"query":{"term":{"name.keyword":"sunbeam-embeddings"}}}' \
                  | json_field _id)
                if [ -z "$GROUP_ID" ]; then
                  GROUP_ID=$(curl -sf -X POST "$OS/_plugins/_ml/model_groups/_register" \
                    -H 'Content-Type: application/json' \
                    -d '{"name":"sunbeam-embeddings","description":"Sunbeam embedding models"}' \
                    | json_field model_group_id)
                fi
                [ -n "$GROUP_ID" ] || die "could not find or create model group"
                echo "Model group: $GROUP_ID"

                # 2. Register BGE-large-en-v1.5 model from SeaweedFS bucket
                # NOTE(review): the request carries no model_content_hash_value;
                # confirm the target OpenSearch version accepts that for
                # URL-registered models.
                echo "Registering BGE-large-en-v1.5 model..."
                TASK_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_register" \
                  -H 'Content-Type: application/json' \
                  -d "{
                    \"name\": \"bge-large-en-v1.5\",
                    \"version\": \"1.0.0\",
                    \"model_format\": \"ONNX\",
                    \"model_group_id\": \"$GROUP_ID\",
                    \"model_config\": {
                      \"model_type\": \"bert\",
                      \"embedding_dimension\": 1024,
                      \"framework_type\": \"sentence_transformers\",
                      \"all_config\": \"{\\\"_name_or_path\\\":\\\"BAAI/bge-large-en-v1.5\\\",\\\"model_type\\\":\\\"bert\\\"}\"
                    },
                    \"url\": \"http://seaweedfs-filer.storage.svc.cluster.local:8333/sunbeam-ml-models/bge-large-en-v1.5.zip\"
                  }" \
                  | json_field task_id)
                [ -n "$TASK_ID" ] || die "model registration request returned no task_id"
                echo "Registration task: $TASK_ID"

                # Wait for model registration (60 attempts, 10s apart)
                echo "Waiting for model registration..."
                STATUS=""
                for i in $(seq 1 60); do
                  STATUS=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" \
                    | json_field state)
                  echo "  status: $STATUS"
                  if [ "$STATUS" = "COMPLETED" ]; then break; fi
                  if [ "$STATUS" = "FAILED" ]; then
                    echo "Model registration failed!"
                    # Dump the task document for diagnosis; never mask the failure.
                    curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" || true
                    exit 1
                  fi
                  sleep 10
                done
                [ "$STATUS" = "COMPLETED" ] \
                  || die "timed out waiting for model registration (last status: $STATUS)"

                MODEL_ID=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" \
                  | json_field model_id)
                [ -n "$MODEL_ID" ] || die "registration task completed without a model_id"
                echo "Model ID: $MODEL_ID"

                # 3. Deploy the model
                echo "Deploying model..."
                curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy"

                echo "Waiting for model deployment..."
                wait_for_deploy
              fi

              # Both pipelines below are useless without a model ID.
              [ -n "$MODEL_ID" ] || die "no model ID available for pipeline creation"

              # 4. Create/update ingest pipeline (PUT is idempotent)
              echo "Creating ingest pipeline..."
              curl -sf -X PUT "$OS/_ingest/pipeline/tuwunel_embedding_pipeline" \
                -H 'Content-Type: application/json' \
                -d "{
                  \"description\": \"Tuwunel message embedding pipeline\",
                  \"processors\": [{
                    \"text_embedding\": {
                      \"model_id\": \"$MODEL_ID\",
                      \"field_map\": {
                        \"body\": \"embedding\"
                      }
                    }
                  }]
                }"

              # 5. Create/update search pipeline (PUT is idempotent)
              echo "Creating search pipeline..."
              curl -sf -X PUT "$OS/_search/pipeline/tuwunel_hybrid_pipeline" \
                -H 'Content-Type: application/json' \
                -d '{
                  "description": "Tuwunel hybrid BM25+neural search pipeline",
                  "phase_results_processors": [{
                    "normalization-processor": {
                      "normalization": { "technique": "min_max" },
                      "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": { "weights": [0.3, 0.7] }
                      }
                    }
                  }]
                }'

              echo "OpenSearch ML init complete. Model ID: $MODEL_ID"
              echo "Update tuwunel config search_opensearch_model_id with: $MODEL_ID"