Files
sbbb/base/data/opensearch-ml-init-job.yaml
Sienna Meridian Satterwhite 584e98316b feat(data): upgrade OpenSearch to v3 with ML Commons for neural search
- Upgrade from OpenSearch 2 to 3 (required for ML Commons pre-trained models)
- Rename PLUGINS_SECURITY_DISABLED → DISABLE_SECURITY_PLUGIN (OS3 change)
- Enable ML Commons plugin settings for on-data-node inference
- Increase memory limits (2Gi) and JVM heap for neural model inference
- Add fsGroup security context for volume permissions
2026-03-10 18:52:29 +00:00

151 lines
6.5 KiB
YAML

# One-shot Job that bootstraps OpenSearch ML Commons for neural search:
#   1. waits for the cluster to answer on /_cluster/health,
#   2. registers (or re-uses) the bge-large-en-v1.5 embedding model and deploys it,
#   3. creates/updates the ingest and search pipelines that reference the model.
# The script is written to be safely re-runnable (restartPolicy: OnFailure) and
# exits non-zero whenever a step does not reach its expected end state, so a
# half-initialised cluster is never reported as success.
apiVersion: batch/v1
kind: Job
metadata:
  name: opensearch-ml-init
  namespace: data
  annotations:
    helm.sh/hook: post-install
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: init
          # NOTE(review): ":latest" is unpinned; consider pinning a specific
          # tag so re-runs of this Job are reproducible.
          image: curlimages/curl:latest
          command:
            - /bin/sh
            - -c
            - |
              set -eu
              OS=http://opensearch.data.svc.cluster.local:9200

              # Print a message to stderr and abort the Job attempt.
              die() {
                echo "ERROR: $*" >&2
                exit 1
              }

              # Print the FIRST string value of JSON key $1 found on stdin
              # (empty output if the key is absent). The responses are single-line
              # compact JSON, so a greedy sed would return the LAST occurrence.
              json_str() {
                grep -o "\"$1\":\"[^\"]*\"" | head -n 1 | cut -d'"' -f4
              }

              # Print the current model_state of model $1 (empty on HTTP error).
              model_state() {
                curl -sf "$OS/_plugins/_ml/models/$1" | json_str model_state
              }

              # Poll model $1 up to 30 times, 5 s apart. Returns 0 once the state
              # is DEPLOYED, 1 on timeout. Leaves the last seen state in $STATE.
              wait_for_deployed() {
                deploy_try=1
                while [ "$deploy_try" -le 30 ]; do
                  STATE=$(model_state "$1")
                  echo " state: $STATE"
                  if [ "$STATE" = "DEPLOYED" ]; then return 0; fi
                  sleep 5
                  deploy_try=$((deploy_try + 1))
                done
                return 1
              }

              echo "Waiting for OpenSearch..."
              until curl -sf "$OS/_cluster/health" >/dev/null 2>&1; do
                sleep 5
              done
              echo "OpenSearch is ready."

              # --- Idempotent: check if model already exists before registering ---
              # match_phrase matches the name whether the field is mapped as text or
              # keyword; chunk documents (which carry chunk_number) are excluded so
              # the returned _id is the model itself; size=1 keeps a single hit.
              # A failed search yields an empty ID and falls through to registration.
              EXISTING_MODEL_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_search" \
                -H 'Content-Type: application/json' \
                -d '{"size":1,"query":{"bool":{"must":[{"match_phrase":{"name":"bge-large-en-v1.5"}}],"must_not":[{"exists":{"field":"chunk_number"}}]}}}' \
                | json_str _id)

              if [ -n "$EXISTING_MODEL_ID" ]; then
                echo "Model already registered: $EXISTING_MODEL_ID"
                MODEL_ID="$EXISTING_MODEL_ID"
                # Ensure it's deployed
                STATE=$(model_state "$MODEL_ID")
                if [ "$STATE" != "DEPLOYED" ]; then
                  echo "Model not deployed (state=$STATE), deploying..."
                  # Best effort: the request may be rejected (e.g. a deploy is already
                  # in flight); the polling below decides success or failure.
                  curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy" || true
                  wait_for_deployed "$MODEL_ID" \
                    || die "Model $MODEL_ID did not reach DEPLOYED (last state: $STATE)"
                fi
              else
                # 1. Create the model group. If registration yields no ID (e.g. the
                #    group was created by an earlier, partially failed run), fall
                #    back to looking the group up by name.
                echo "Creating model group..."
                GROUP_ID=$(curl -sf -X POST "$OS/_plugins/_ml/model_groups/_register" \
                  -H 'Content-Type: application/json' \
                  -d '{"name":"sunbeam-embeddings","description":"Sunbeam embedding models"}' \
                  | json_str model_group_id)
                if [ -z "$GROUP_ID" ]; then
                  echo "Model group registration returned no ID; looking up existing group..."
                  GROUP_ID=$(curl -sf -X POST "$OS/_plugins/_ml/model_groups/_search" \
                    -H 'Content-Type: application/json' \
                    -d '{"size":1,"query":{"match_phrase":{"name":"sunbeam-embeddings"}}}' \
                    | json_str _id)
                fi
                [ -n "$GROUP_ID" ] || die "Could not create or find model group sunbeam-embeddings"
                echo "Model group: $GROUP_ID"

                # 2. Register BGE-large-en-v1.5 model from SeaweedFS bucket
                echo "Registering BGE-large-en-v1.5 model..."
                TASK_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_register" \
                  -H 'Content-Type: application/json' \
                  -d "{
                    \"name\": \"bge-large-en-v1.5\",
                    \"version\": \"1.0.0\",
                    \"model_format\": \"ONNX\",
                    \"model_group_id\": \"$GROUP_ID\",
                    \"model_config\": {
                      \"model_type\": \"bert\",
                      \"embedding_dimension\": 1024,
                      \"framework_type\": \"sentence_transformers\",
                      \"all_config\": \"{\\\"_name_or_path\\\":\\\"BAAI/bge-large-en-v1.5\\\",\\\"model_type\\\":\\\"bert\\\"}\"
                    },
                    \"url\": \"http://seaweedfs-filer.storage.svc.cluster.local:8333/sunbeam-ml-models/bge-large-en-v1.5.zip\"
                  }" \
                  | json_str task_id)
                # curl failures inside $(... | ...) do not trip `set -e`, so check explicitly.
                [ -n "$TASK_ID" ] || die "Model registration request returned no task_id"
                echo "Registration task: $TASK_ID"

                # Wait for model registration: up to 60 polls, 10 s apart.
                echo "Waiting for model registration..."
                STATUS=
                register_try=1
                while [ "$register_try" -le 60 ]; do
                  STATUS=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" | json_str state)
                  echo " status: $STATUS"
                  if [ "$STATUS" = "COMPLETED" ]; then break; fi
                  if [ "$STATUS" = "FAILED" ]; then
                    echo "Model registration failed!"
                    # Dump the task for diagnostics; never let this mask the exit code.
                    curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" || true
                    exit 1
                  fi
                  sleep 10
                  register_try=$((register_try + 1))
                done
                # Falling out of the loop on timeout must not be treated as success.
                [ "$STATUS" = "COMPLETED" ] \
                  || die "Model registration did not complete after 60 polls (last status: $STATUS)"

                MODEL_ID=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" | json_str model_id)
                [ -n "$MODEL_ID" ] || die "Registration task $TASK_ID reported no model_id"
                echo "Model ID: $MODEL_ID"

                # 3. Deploy the model
                echo "Deploying model..."
                curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy"
                echo "Waiting for model deployment..."
                wait_for_deployed "$MODEL_ID" \
                  || die "Model $MODEL_ID did not reach DEPLOYED (last state: $STATE)"
              fi

              # 4. Create/update ingest pipeline (PUT is idempotent).
              #    Embeds the "body" field into "embedding" using the deployed model.
              echo "Creating ingest pipeline..."
              curl -sf -X PUT "$OS/_ingest/pipeline/tuwunel_embedding_pipeline" \
                -H 'Content-Type: application/json' \
                -d "{
                  \"description\": \"Tuwunel message embedding pipeline\",
                  \"processors\": [{
                    \"text_embedding\": {
                      \"model_id\": \"$MODEL_ID\",
                      \"field_map\": {
                        \"body\": \"embedding\"
                      }
                    }
                  }]
                }"

              # 5. Create/update search pipeline (PUT is idempotent).
              #    NOTE(review): the weights presumably apply in sub-query order of the
              #    hybrid query (BM25 first, neural second) -- confirm against the caller.
              echo "Creating search pipeline..."
              curl -sf -X PUT "$OS/_search/pipeline/tuwunel_hybrid_pipeline" \
                -H 'Content-Type: application/json' \
                -d '{
                  "description": "Tuwunel hybrid BM25+neural search pipeline",
                  "phase_results_processors": [{
                    "normalization-processor": {
                      "normalization": { "technique": "min_max" },
                      "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": { "weights": [0.3, 0.7] }
                      }
                    }
                  }]
                }'

              echo "OpenSearch ML init complete. Model ID: $MODEL_ID"
              echo "Update tuwunel config search_opensearch_model_id with: $MODEL_ID"