- Upgrade from OpenSearch 2 to 3 (required for ML Commons pre-trained models)
- Rename PLUGINS_SECURITY_DISABLED → DISABLE_SECURITY_PLUGIN (OpenSearch 3 change)
- Enable ML Commons plugin settings for on-data-node inference
- Increase memory limits (2Gi) and JVM heap for neural model inference
- Add fsGroup security context for volume permissions
151 lines
6.5 KiB
YAML
# OpenSearch ML bootstrap job.
#
# Registers and deploys the BGE-large-en-v1.5 embedding model (ONNX, served
# from the SeaweedFS model bucket), then creates the ingest pipeline used for
# message embeddings and the hybrid BM25+neural search pipeline.
#
# Idempotent: re-running finds an already-registered model by name and only
# (re)deploys it; the pipeline PUTs overwrite in place.
apiVersion: batch/v1
kind: Job
metadata:
  name: opensearch-ml-init
  namespace: data
  annotations:
    # Runs once after install. NOTE(review): if model/pipelines must survive
    # chart upgrades, consider "post-install,post-upgrade" — confirm intent.
    helm.sh/hook: post-install
spec:
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: init
          # NOTE(review): unpinned tag — consider pinning a version/digest for
          # reproducible runs.
          image: curlimages/curl:latest
          command:
            - /bin/sh
            - -c
            - |
              set -e
              OS=http://opensearch.data.svc.cluster.local:9200

              echo "Waiting for OpenSearch..."
              until curl -sf "$OS/_cluster/health" >/dev/null 2>&1; do
                sleep 5
              done
              echo "OpenSearch is ready."

              # --- Idempotent: check if model already exists before registering ---

              # Check for existing deployed model by name.
              # (sed-based JSON scraping: the curl image has no jq; the first
              # "_id" in the _search response is the matching model document.)
              EXISTING_MODEL_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_search" \
                -H 'Content-Type: application/json' \
                -d '{"query":{"bool":{"must":[{"term":{"name":"bge-large-en-v1.5"}}]}}}' \
                | sed -n 's/.*"_id":"\([^"]*\)".*/\1/p' | head -1)

              if [ -n "$EXISTING_MODEL_ID" ]; then
                echo "Model already registered: $EXISTING_MODEL_ID"
                MODEL_ID="$EXISTING_MODEL_ID"

                # Ensure it's deployed
                STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                  | sed -n 's/.*"model_state":"\([^"]*\)".*/\1/p')
                if [ "$STATE" != "DEPLOYED" ]; then
                  echo "Model not deployed (state=$STATE), deploying..."
                  # Best-effort: a concurrent deploy in progress returns an
                  # error; the poll below confirms the final state either way.
                  curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy" || true
                  for i in $(seq 1 30); do
                    STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                      | sed -n 's/.*"model_state":"\([^"]*\)".*/\1/p')
                    echo " state: $STATE"
                    if [ "$STATE" = "DEPLOYED" ]; then break; fi
                    sleep 5
                  done
                fi
              else
                # 1. Create model group (idempotent — OpenSearch deduplicates by name)
                echo "Creating model group..."
                GROUP_ID=$(curl -sf -X POST "$OS/_plugins/_ml/model_groups/_register" \
                  -H 'Content-Type: application/json' \
                  -d '{"name":"sunbeam-embeddings","description":"Sunbeam embedding models"}' \
                  | sed -n 's/.*"model_group_id":"\([^"]*\)".*/\1/p')
                echo "Model group: $GROUP_ID"

                # 2. Register BGE-large-en-v1.5 model from SeaweedFS bucket
                echo "Registering BGE-large-en-v1.5 model..."
                TASK_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_register" \
                  -H 'Content-Type: application/json' \
                  -d "{
                    \"name\": \"bge-large-en-v1.5\",
                    \"version\": \"1.0.0\",
                    \"model_format\": \"ONNX\",
                    \"model_group_id\": \"$GROUP_ID\",
                    \"model_config\": {
                      \"model_type\": \"bert\",
                      \"embedding_dimension\": 1024,
                      \"framework_type\": \"sentence_transformers\",
                      \"all_config\": \"{\\\"_name_or_path\\\":\\\"BAAI/bge-large-en-v1.5\\\",\\\"model_type\\\":\\\"bert\\\"}\"
                    },
                    \"url\": \"http://seaweedfs-filer.storage.svc.cluster.local:8333/sunbeam-ml-models/bge-large-en-v1.5.zip\"
                  }" \
                  | sed -n 's/.*"task_id":"\([^"]*\)".*/\1/p')
                echo "Registration task: $TASK_ID"

                # Wait for model registration (up to ~10 minutes)
                echo "Waiting for model registration..."
                for i in $(seq 1 60); do
                  STATUS=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" \
                    | sed -n 's/.*"state":"\([^"]*\)".*/\1/p')
                  echo " status: $STATUS"
                  if [ "$STATUS" = "COMPLETED" ]; then break; fi
                  if [ "$STATUS" = "FAILED" ]; then
                    echo "Model registration failed!"
                    curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID"
                    exit 1
                  fi
                  sleep 10
                done

                # Fail loudly on timeout instead of proceeding with a
                # half-registered model and an empty MODEL_ID.
                if [ "$STATUS" != "COMPLETED" ]; then
                  echo "Model registration timed out (last status: $STATUS)"
                  exit 1
                fi

                MODEL_ID=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" \
                  | sed -n 's/.*"model_id":"\([^"]*\)".*/\1/p')
                echo "Model ID: $MODEL_ID"

                # 3. Deploy the model
                echo "Deploying model..."
                curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy"

                echo "Waiting for model deployment..."
                for i in $(seq 1 30); do
                  STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                    | sed -n 's/.*"model_state":"\([^"]*\)".*/\1/p')
                  echo " state: $STATE"
                  if [ "$STATE" = "DEPLOYED" ]; then break; fi
                  sleep 5
                done
              fi

              # Guard: never create pipelines bound to an empty model id —
              # that would silently break every ingest using the pipeline.
              if [ -z "$MODEL_ID" ]; then
                echo "No model ID resolved; aborting pipeline creation."
                exit 1
              fi

              # 4. Create/update ingest pipeline (PUT is idempotent)
              echo "Creating ingest pipeline..."
              curl -sf -X PUT "$OS/_ingest/pipeline/tuwunel_embedding_pipeline" \
                -H 'Content-Type: application/json' \
                -d "{
                  \"description\": \"Tuwunel message embedding pipeline\",
                  \"processors\": [{
                    \"text_embedding\": {
                      \"model_id\": \"$MODEL_ID\",
                      \"field_map\": {
                        \"body\": \"embedding\"
                      }
                    }
                  }]
                }"

              # 5. Create/update search pipeline (PUT is idempotent)
              echo "Creating search pipeline..."
              curl -sf -X PUT "$OS/_search/pipeline/tuwunel_hybrid_pipeline" \
                -H 'Content-Type: application/json' \
                -d '{
                  "description": "Tuwunel hybrid BM25+neural search pipeline",
                  "phase_results_processors": [{
                    "normalization-processor": {
                      "normalization": { "technique": "min_max" },
                      "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": { "weights": [0.3, 0.7] }
                      }
                    }
                  }]
                }'

              echo "OpenSearch ML init complete. Model ID: $MODEL_ID"
              echo "Update tuwunel config search_opensearch_model_id with: $MODEL_ID"