#!/usr/bin/env bash
#
# Initialize OpenSearch ML pipelines for local dev.
# Mirrors production: all-mpnet-base-v2 (768-dim), same pipelines.
#
# Run after `docker compose -f docker-compose.dev.yaml up -d`

set -euo pipefail

readonly OS="http://localhost:9200"
readonly MODEL_NAME="huggingface/sentence-transformers/all-mpnet-base-v2"

# Print an error to stderr and abort.
die() { printf 'ERROR: %s\n' "$*" >&2; exit 1; }

#######################################
# Block until the cluster answers /_cluster/health.
#######################################
wait_for_opensearch() {
  echo "Waiting for OpenSearch..."
  until curl -sf "$OS/_cluster/health" >/dev/null 2>&1; do
    sleep 2
  done
  echo "OpenSearch is ready."
}

#######################################
# Apply the same persistent ML Commons settings used in production.
#######################################
configure_ml_commons() {
  echo "Configuring ML Commons..."
  curl -sf -X PUT "$OS/_cluster/settings" \
    -H 'Content-Type: application/json' \
    -d '{
      "persistent": {
        "plugins.ml_commons.only_run_on_ml_node": false,
        "plugins.ml_commons.native_memory_threshold": 90,
        "plugins.ml_commons.model_access_control_enabled": false,
        "plugins.ml_commons.allow_registering_model_via_url": true
      }
    }' > /dev/null || die "failed to apply ML Commons settings"
  echo "Done."
}

#######################################
# Print the registered model's ID, or nothing if not registered.
# Parent model docs carry no chunk_number field; chunk docs reference
# the parent via their model_id field. Best-effort on purpose: on a
# fresh cluster the ML model index does not exist yet and the search
# 404s — that must not kill the script under `set -e`.
#######################################
find_model_id() {
  local resp
  resp=$(curl -s -X POST "$OS/_plugins/_ml/models/_search" \
    -H 'Content-Type: application/json' \
    -d "{\"query\":{\"term\":{\"name\":\"$MODEL_NAME\"}},\"size\":10}" \
    2>/dev/null) || return 0
  printf '%s' "$resp" | python3 -c "
import sys, json
try:
    hits = json.load(sys.stdin).get('hits', {}).get('hits', [])
except Exception:
    hits = []
for h in hits:
    # Parent model docs have no chunk_number field.
    if h.get('_source', {}).get('chunk_number') is None:
        print(h['_id'])
        break
else:
    if hits:
        # Fallback: every chunk references its parent via model_id.
        print(hits[0]['_source'].get('model_id', hits[0]['_id']))
" 2>/dev/null || true
}

#######################################
# Print the model_state for a model ID (UNKNOWN on any failure).
#######################################
get_model_state() {
  curl -sf "$OS/_plugins/_ml/models/$1" 2>/dev/null \
    | python3 -c "import sys,json; print(json.load(sys.stdin).get('model_state','UNKNOWN'))" \
    2>/dev/null || echo "UNKNOWN"
}

#######################################
# Deploy a model (idempotent) and poll until DEPLOYED; die on timeout
# instead of silently proceeding with an unusable model.
#######################################
ensure_deployed() {
  local model_id=$1 state i
  state=$(get_model_state "$model_id")
  if [[ "$state" == "DEPLOYED" ]]; then
    echo "Model already deployed."
    return 0
  fi
  echo "Model state: $state — deploying..."
  # The deploy call errors if a deploy is already in flight; the polling
  # below is the real success check, so tolerate a failure here.
  curl -sf -X POST "$OS/_plugins/_ml/models/$model_id/_deploy" > /dev/null || true
  echo "Waiting for deployment..."
  for i in {1..30}; do
    state=$(get_model_state "$model_id")
    echo "  state: $state"
    if [[ "$state" == "DEPLOYED" ]]; then
      return 0
    fi
    sleep 5
  done
  die "model $model_id did not reach DEPLOYED within timeout"
}

#######################################
# Register the pretrained model (same as production) and wait for the
# download/registration task. Sets the global MODEL_ID on success;
# dies on task failure, missing task_id, or timeout.
#######################################
register_model() {
  local task_id resp status i
  echo "Registering all-mpnet-base-v2 (pretrained, TORCH_SCRIPT, 768-dim)..."
  task_id=$(curl -sf -X POST "$OS/_plugins/_ml/models/_register" \
    -H 'Content-Type: application/json' \
    -d "{
      \"name\": \"$MODEL_NAME\",
      \"version\": \"1.0.1\",
      \"model_format\": \"TORCH_SCRIPT\"
    }" | python3 -c "import sys,json; print(json.load(sys.stdin).get('task_id',''))")
  [[ -n "$task_id" ]] || die "registration request returned no task_id"
  echo "Registration task: $task_id"
  echo "Waiting for model download + registration..."
  for i in {1..90}; do
    resp=$(curl -sf "$OS/_plugins/_ml/tasks/$task_id") || resp='{}'
    status=$(printf '%s' "$resp" \
      | python3 -c "import sys,json; print(json.load(sys.stdin).get('state','UNKNOWN'))")
    echo "  [$i] $status"
    if [[ "$status" == "COMPLETED" ]]; then
      MODEL_ID=$(printf '%s' "$resp" \
        | python3 -c "import sys,json; print(json.load(sys.stdin).get('model_id',''))")
      break
    fi
    if [[ "$status" == "FAILED" ]]; then
      echo "Registration failed!" >&2
      printf '%s\n' "$resp" | python3 -m json.tool >&2
      exit 1
    fi
    sleep 10
  done
  [[ -n "${MODEL_ID:-}" ]] || die "registration did not complete within timeout"
  echo "Model ID: $MODEL_ID"
}

#######################################
# Create the ingest pipeline that embeds message bodies (prod parity).
# Arguments: $1 - deployed model ID
#######################################
create_ingest_pipeline() {
  local model_id=$1
  echo "Creating ingest pipeline: tuwunel_embedding_pipeline..."
  curl -sf -X PUT "$OS/_ingest/pipeline/tuwunel_embedding_pipeline" \
    -H 'Content-Type: application/json' \
    -d "{
      \"description\": \"Tuwunel message embedding pipeline\",
      \"processors\": [{
        \"text_embedding\": {
          \"model_id\": \"$model_id\",
          \"field_map\": { \"body\": \"embedding\" }
        }
      }]
    }" > /dev/null || die "failed to create ingest pipeline"
  echo "Done."
}

#######################################
# Create the hybrid BM25+neural search pipeline (prod parity).
# Score weights: 0.3 BM25, 0.7 neural.
#######################################
create_search_pipeline() {
  echo "Creating search pipeline: tuwunel_hybrid_pipeline..."
  curl -sf -X PUT "$OS/_search/pipeline/tuwunel_hybrid_pipeline" \
    -H 'Content-Type: application/json' \
    -d '{
      "description": "Tuwunel hybrid BM25+neural search pipeline",
      "phase_results_processors": [{
        "normalization-processor": {
          "normalization": { "technique": "min_max" },
          "combination": {
            "technique": "arithmetic_mean",
            "parameters": { "weights": [0.3, 0.7] }
          }
        }
      }]
    }' > /dev/null || die "failed to create search pipeline"
  echo "Done."
}

main() {
  wait_for_opensearch
  configure_ml_commons

  MODEL_ID=$(find_model_id)
  if [[ -n "$MODEL_ID" ]]; then
    echo "Model already registered: $MODEL_ID"
    ensure_deployed "$MODEL_ID"
  else
    register_model          # sets MODEL_ID
    echo "Deploying model..."
    ensure_deployed "$MODEL_ID"
  fi

  echo ""
  echo "Model $MODEL_ID deployed."
  create_ingest_pipeline "$MODEL_ID"
  create_search_pipeline

  echo ""
  echo "OpenSearch ML init complete."
  echo "  Model: all-mpnet-base-v2 ($MODEL_ID)"
  echo "  Ingest pipeline: tuwunel_embedding_pipeline"
  echo "  Search pipeline: tuwunel_hybrid_pipeline"
}

main "$@"