feat(data): upgrade OpenSearch to v3 with ML Commons for neural search
- Upgrade from OpenSearch 2 to 3 (required for ML Commons pre-trained models)
- Rename PLUGINS_SECURITY_DISABLED → DISABLE_SECURITY_PLUGIN (OS3 change)
- Enable ML Commons plugin settings for on-data-node inference
- Increase memory limits (2Gi) and JVM heap for neural model inference
- Add fsGroup security context for volume permissions
This commit is contained in:
@@ -13,6 +13,8 @@ spec:
|
||||
labels:
|
||||
app: opensearch
|
||||
spec:
|
||||
securityContext:
|
||||
fsGroup: 1000
|
||||
initContainers:
|
||||
- name: sysctl
|
||||
image: busybox
|
||||
@@ -21,7 +23,7 @@ spec:
|
||||
privileged: true
|
||||
containers:
|
||||
- name: opensearch
|
||||
image: opensearchproject/opensearch:2
|
||||
image: opensearchproject/opensearch:3
|
||||
ports:
|
||||
- name: http
|
||||
containerPort: 9200
|
||||
@@ -33,14 +35,22 @@ spec:
|
||||
- name: discovery.type
|
||||
value: single-node
|
||||
- name: OPENSEARCH_JAVA_OPTS
|
||||
value: "-Xms512m -Xmx1g"
|
||||
value: "-Xms1g -Xmx1536m"
|
||||
- name: DISABLE_SECURITY_PLUGIN
|
||||
value: "true"
|
||||
- name: plugins.ml_commons.only_run_on_ml_node
|
||||
value: "false"
|
||||
- name: plugins.ml_commons.native_memory_threshold
|
||||
value: "90"
|
||||
- name: plugins.ml_commons.model_access_control_enabled
|
||||
value: "false"
|
||||
- name: plugins.ml_commons.allow_registering_model_via_url
|
||||
value: "true"
|
||||
resources:
|
||||
limits:
|
||||
memory: 1500Mi
|
||||
memory: 2Gi
|
||||
requests:
|
||||
memory: 768Mi
|
||||
memory: 1Gi
|
||||
cpu: 100m
|
||||
volumeMounts:
|
||||
- name: data
|
||||
|
||||
150
base/data/opensearch-ml-init-job.yaml
Normal file
150
base/data/opensearch-ml-init-job.yaml
Normal file
@@ -0,0 +1,150 @@
|
||||
# OpenSearch ML init Job: registers and deploys the BGE-large-en-v1.5 embedding
# model via ML Commons, then creates the tuwunel ingest and hybrid-search
# pipelines. Idempotent: re-runs look the model up by name and only repair what
# is missing, so restartPolicy: OnFailure can safely retry after any failure.
apiVersion: batch/v1
kind: Job
metadata:
  name: opensearch-ml-init
  namespace: data
  annotations:
    # NOTE(review): helm.sh/hook only takes effect when this manifest is
    # rendered by Helm; the base/ path suggests kustomize — confirm the hook
    # is actually honored in this deployment pipeline.
    helm.sh/hook: post-install
spec:
  backoffLimit: 6 # bound pod retries explicitly; each retry is safe (script is idempotent)
  template:
    spec:
      restartPolicy: OnFailure
      containers:
        - name: init
          image: curlimages/curl:latest
          command:
            - /bin/sh
            - -c
            - |
              set -e
              OS=http://opensearch.data.svc.cluster.local:9200

              echo "Waiting for OpenSearch..."
              until curl -sf "$OS/_cluster/health" >/dev/null 2>&1; do
                sleep 5
              done
              echo "OpenSearch is ready."

              # --- Idempotent: check if model already exists before registering ---
              # JSON responses are scraped with sed because curlimages/curl ships
              # no jq; brittle if field ordering changes, but adequate here.

              # Check for existing model by name. If the ML indices don't exist
              # yet, curl -f fails on the 404 but the pipeline still exits 0
              # (exit status is sed's), leaving EXISTING_MODEL_ID empty and
              # taking the fresh-registration path below.
              EXISTING_MODEL_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_search" \
                -H 'Content-Type: application/json' \
                -d '{"query":{"bool":{"must":[{"term":{"name":"bge-large-en-v1.5"}}]}}}' \
                | sed -n 's/.*"_id":"\([^"]*\)".*/\1/p' | head -1)

              if [ -n "$EXISTING_MODEL_ID" ]; then
                echo "Model already registered: $EXISTING_MODEL_ID"
                MODEL_ID="$EXISTING_MODEL_ID"

                # Ensure it's deployed
                STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                  | sed -n 's/.*"model_state":"\([^"]*\)".*/\1/p')
                if [ "$STATE" != "DEPLOYED" ]; then
                  echo "Model not deployed (state=$STATE), deploying..."
                  curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy" || true
                  for i in $(seq 1 30); do
                    STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                      | sed -n 's/.*"model_state":"\([^"]*\)".*/\1/p')
                    echo "  state: $STATE"
                    if [ "$STATE" = "DEPLOYED" ]; then break; fi
                    sleep 5
                  done
                  # FIX: previously fell through silently on timeout; fail so
                  # the Job retries instead of succeeding with an unusable model.
                  if [ "$STATE" != "DEPLOYED" ]; then
                    echo "Timed out waiting for model deployment (state=$STATE)" >&2
                    exit 1
                  fi
                fi
              else
                # 1. Create model group (idempotent — OpenSearch deduplicates by name)
                echo "Creating model group..."
                GROUP_ID=$(curl -sf -X POST "$OS/_plugins/_ml/model_groups/_register" \
                  -H 'Content-Type: application/json' \
                  -d '{"name":"sunbeam-embeddings","description":"Sunbeam embedding models"}' \
                  | sed -n 's/.*"model_group_id":"\([^"]*\)".*/\1/p')
                echo "Model group: $GROUP_ID"
                # FIX: an empty GROUP_ID would silently produce a broken
                # registration payload below — fail fast instead.
                if [ -z "$GROUP_ID" ]; then
                  echo "Failed to create model group" >&2
                  exit 1
                fi

                # 2. Register BGE-large-en-v1.5 model from SeaweedFS bucket
                echo "Registering BGE-large-en-v1.5 model..."
                TASK_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_register" \
                  -H 'Content-Type: application/json' \
                  -d "{
                    \"name\": \"bge-large-en-v1.5\",
                    \"version\": \"1.0.0\",
                    \"model_format\": \"ONNX\",
                    \"model_group_id\": \"$GROUP_ID\",
                    \"model_config\": {
                      \"model_type\": \"bert\",
                      \"embedding_dimension\": 1024,
                      \"framework_type\": \"sentence_transformers\",
                      \"all_config\": \"{\\\"_name_or_path\\\":\\\"BAAI/bge-large-en-v1.5\\\",\\\"model_type\\\":\\\"bert\\\"}\"
                    },
                    \"url\": \"http://seaweedfs-filer.storage.svc.cluster.local:8333/sunbeam-ml-models/bge-large-en-v1.5.zip\"
                  }" \
                  | sed -n 's/.*"task_id":"\([^"]*\)".*/\1/p')
                echo "Registration task: $TASK_ID"
                # FIX: without a task id the polling loop below spins on a
                # malformed URL — fail fast instead.
                if [ -z "$TASK_ID" ]; then
                  echo "Model registration request returned no task_id" >&2
                  exit 1
                fi

                # Wait for model registration (model download can take minutes)
                echo "Waiting for model registration..."
                for i in $(seq 1 60); do
                  STATUS=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" \
                    | sed -n 's/.*"state":"\([^"]*\)".*/\1/p')
                  echo "  status: $STATUS"
                  if [ "$STATUS" = "COMPLETED" ]; then break; fi
                  if [ "$STATUS" = "FAILED" ]; then
                    echo "Model registration failed!"
                    curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID"
                    exit 1
                  fi
                  sleep 10
                done
                # FIX: on timeout the old script fell through with an empty
                # MODEL_ID and created an ingest pipeline with "model_id": "".
                if [ "$STATUS" != "COMPLETED" ]; then
                  echo "Timed out waiting for model registration (status=$STATUS)" >&2
                  exit 1
                fi

                MODEL_ID=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID" \
                  | sed -n 's/.*"model_id":"\([^"]*\)".*/\1/p')
                echo "Model ID: $MODEL_ID"
                if [ -z "$MODEL_ID" ]; then
                  echo "Completed registration task has no model_id" >&2
                  exit 1
                fi

                # 3. Deploy the model
                echo "Deploying model..."
                curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy"

                echo "Waiting for model deployment..."
                for i in $(seq 1 30); do
                  STATE=$(curl -sf "$OS/_plugins/_ml/models/$MODEL_ID" \
                    | sed -n 's/.*"model_state":"\([^"]*\)".*/\1/p')
                  echo "  state: $STATE"
                  if [ "$STATE" = "DEPLOYED" ]; then break; fi
                  sleep 5
                done
                # FIX: treat a deploy timeout as failure; a retried run will
                # find the registered model and only re-attempt deployment.
                if [ "$STATE" != "DEPLOYED" ]; then
                  echo "Timed out waiting for model deployment (state=$STATE)" >&2
                  exit 1
                fi
              fi

              # 4. Create/update ingest pipeline (PUT is idempotent)
              echo "Creating ingest pipeline..."
              curl -sf -X PUT "$OS/_ingest/pipeline/tuwunel_embedding_pipeline" \
                -H 'Content-Type: application/json' \
                -d "{
                  \"description\": \"Tuwunel message embedding pipeline\",
                  \"processors\": [{
                    \"text_embedding\": {
                      \"model_id\": \"$MODEL_ID\",
                      \"field_map\": {
                        \"body\": \"embedding\"
                      }
                    }
                  }]
                }"

              # 5. Create/update search pipeline (PUT is idempotent)
              echo "Creating search pipeline..."
              curl -sf -X PUT "$OS/_search/pipeline/tuwunel_hybrid_pipeline" \
                -H 'Content-Type: application/json' \
                -d '{
                  "description": "Tuwunel hybrid BM25+neural search pipeline",
                  "phase_results_processors": [{
                    "normalization-processor": {
                      "normalization": { "technique": "min_max" },
                      "combination": {
                        "technique": "arithmetic_mean",
                        "parameters": { "weights": [0.3, 0.7] }
                      }
                    }
                  }]
                }'

              echo "OpenSearch ML init complete. Model ID: $MODEL_ID"
              echo "Update tuwunel config search_opensearch_model_id with: $MODEL_ID"
|
||||
Reference in New Issue
Block a user