#!/bin/bash
## sol/dev/opensearch-init.sh
##
## Initialize OpenSearch ML pipelines for local dev.
## Mirrors production: all-mpnet-base-v2 (768-dim), same pipelines.
##
## Run after `docker compose -f docker-compose.dev.yaml up -d`

set -euo pipefail

# Base URL of the OpenSearch node. Defaults to the local dev instance; set
# OPENSEARCH_URL to point the script somewhere else. (The environment's own
# `OS` variable is intentionally not consulted — some platforms export it.)
OS="${OPENSEARCH_URL:-http://localhost:9200}"

# Maximum number of seconds to wait for the cluster health endpoint to
# answer. Set OPENSEARCH_WAIT_TIMEOUT=0 to wait indefinitely.
WAIT_TIMEOUT="${OPENSEARCH_WAIT_TIMEOUT:-300}"

echo "Waiting for OpenSearch..."
WAITED=0
# --max-time keeps a single hung connection from stalling the loop.
until curl -sf --max-time 5 "$OS/_cluster/health" >/dev/null 2>&1; do
    if (( WAIT_TIMEOUT > 0 && WAITED >= WAIT_TIMEOUT )); then
        echo "ERROR: OpenSearch at $OS did not respond within ${WAIT_TIMEOUT}s." >&2
        exit 1
    fi
    sleep 2
    WAITED=$(( WAITED + 2 ))
done
echo "OpenSearch is ready."

# --- Configure ML Commons (matches production persistent settings) ---
# Sends four persistent plugins.ml_commons.* settings to the cluster settings
# endpoint; the response body is discarded.
echo "Configuring ML Commons..."
ML_COMMONS_SETTINGS='{
    "persistent": {
      "plugins.ml_commons.only_run_on_ml_node": false,
      "plugins.ml_commons.native_memory_threshold": 90,
      "plugins.ml_commons.model_access_control_enabled": false,
      "plugins.ml_commons.allow_registering_model_via_url": true
    }
  }'
curl --silent --fail --request PUT \
  --header 'Content-Type: application/json' \
  --data "$ML_COMMONS_SETTINGS" \
  "$OS/_cluster/settings" > /dev/null
echo "Done."

# --- Check for existing registered model ---
# Looks the model up by name and leaves its ID in MODEL_ID (empty when no
# matching model is found or the response cannot be parsed).
MODEL_NAME="huggingface/sentence-transformers/all-mpnet-base-v2"

# Chunk documents are filtered out with must_not/exists on `chunk_number`, so
# the single hit requested is the parent model rather than one of its chunks.
# NOTE(review): the name is matched both against `name` and against a
# `name.keyword` sub-field because the model index mapping is not visible from
# this script — confirm which one applies and drop the other.
printf -v SEARCH_BODY \
  '{"query":{"bool":{"should":[{"term":{"name.keyword":"%s"}},{"term":{"name":"%s"}}],"minimum_should_match":1,"must_not":[{"exists":{"field":"chunk_number"}}]}},"size":1}' \
  "$MODEL_NAME" "$MODEL_NAME"

EXISTING=$(curl -sf -X POST "$OS/_plugins/_ml/models/_search" \
  -H 'Content-Type: application/json' \
  -d "$SEARCH_BODY") || {
    echo "ERROR: model search request to $OS failed." >&2
    exit 1
}

# Resolve the model ID from the first hit. If a chunk document slips through
# anyway, its _source.model_id names the parent model (all chunks share it);
# otherwise the hit's own _id is the model ID. Parse errors yield an empty ID.
MODEL_ID=$(printf '%s' "$EXISTING" | python3 -c '
import json
import sys

hits = json.load(sys.stdin).get("hits", {}).get("hits", [])
if hits:
    hit = hits[0]
    print(hit.get("_source", {}).get("model_id") or hit["_id"])
' 2>/dev/null || echo "")

# --- Register and/or deploy the embedding model ---
# Reads:  OS (base URL), MODEL_ID (empty when no registered model was found).
# Writes: MODEL_ID (set to the new model's ID when one is registered here).
# Exits non-zero with a message on stderr when registration or deployment
# fails or does not finish within its polling budget.

# Print top-level field $1 of the JSON object read from stdin; print $2
# (default: empty string) when the field is absent.
json_field() {
    python3 -c 'import json, sys; print(json.load(sys.stdin).get(sys.argv[1], sys.argv[2]))' \
      "$1" "${2:-}"
}

# Print the model_state of model $1, or UNKNOWN when the lookup or the JSON
# parse fails.
model_state() {
    curl -sf "$OS/_plugins/_ml/models/$1" 2>/dev/null \
      | json_field model_state UNKNOWN 2>/dev/null \
      || echo "UNKNOWN"
}

# Poll model $1 every 5 seconds, at most $2 times. Returns 0 as soon as the
# model reports DEPLOYED and 1 if it never does.
wait_for_deployed() {
    local id=$1 attempts=$2 state i
    for (( i = 1; i <= attempts; i++ )); do
        state=$(model_state "$id")
        echo "  state: $state"
        if [ "$state" = "DEPLOYED" ]; then
            return 0
        fi
        sleep 5
    done
    return 1
}

if [ -n "$MODEL_ID" ]; then
    echo "Model already registered: $MODEL_ID"

    STATE=$(model_state "$MODEL_ID")

    if [ "$STATE" = "DEPLOYED" ]; then
        echo "Model already deployed."
    else
        echo "Model state: $STATE — deploying..."
        # Deliberately best-effort: a failed deploy request is not fatal by
        # itself; the polling below decides whether the model came up.
        curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy" > /dev/null || true
        if ! wait_for_deployed "$MODEL_ID" 30; then
            echo "ERROR: model $MODEL_ID did not reach DEPLOYED state." >&2
            exit 1
        fi
    fi
else
    # Register all-mpnet-base-v2 via pretrained model API (same as production)
    echo "Registering all-mpnet-base-v2 (pretrained, TORCH_SCRIPT, 768-dim)..."
    TASK_ID=$(curl -sf -X POST "$OS/_plugins/_ml/models/_register" \
      -H 'Content-Type: application/json' \
      -d '{
        "name": "huggingface/sentence-transformers/all-mpnet-base-v2",
        "version": "1.0.1",
        "model_format": "TORCH_SCRIPT"
      }' | json_field task_id 2>/dev/null) || TASK_ID=""
    if [ -z "$TASK_ID" ]; then
        echo "ERROR: model registration request failed (no task ID returned)." >&2
        exit 1
    fi
    echo "Registration task: $TASK_ID"

    echo "Waiting for model download + registration..."
    MODEL_ID=""
    STATUS="UNKNOWN"
    for (( i = 1; i <= 90; i++ )); do
        # A failed poll is reported as UNKNOWN and retried instead of
        # aborting the whole script on one transient error.
        RESP=$(curl -sf "$OS/_plugins/_ml/tasks/$TASK_ID") || RESP='{}'
        STATUS=$(printf '%s' "$RESP" | json_field state UNKNOWN 2>/dev/null) || STATUS="UNKNOWN"
        echo "  [$i] $STATUS"
        if [ "$STATUS" = "COMPLETED" ]; then
            MODEL_ID=$(printf '%s' "$RESP" | json_field model_id)
            break
        fi
        if [ "$STATUS" = "FAILED" ]; then
            echo "Registration failed!" >&2
            # Pretty-print the task response; fall back to the raw text.
            printf '%s' "$RESP" | python3 -m json.tool >&2 || printf '%s\n' "$RESP" >&2
            exit 1
        fi
        sleep 10
    done
    if [ -z "$MODEL_ID" ]; then
        echo "ERROR: registration did not complete (last task state: $STATUS)." >&2
        exit 1
    fi
    echo "Model ID: $MODEL_ID"

    # Deploy
    echo "Deploying model..."
    if ! curl -sf -X POST "$OS/_plugins/_ml/models/$MODEL_ID/_deploy" > /dev/null; then
        echo "ERROR: deploy request for model $MODEL_ID failed." >&2
        exit 1
    fi

    echo "Waiting for deployment..."
    if ! wait_for_deployed "$MODEL_ID" 30; then
        echo "ERROR: model $MODEL_ID did not reach DEPLOYED state." >&2
        exit 1
    fi
fi

# The pipelines created below reference the model by ID, so stop here when
# MODEL_ID is empty.
if [[ -z "${MODEL_ID}" ]]; then
    printf '%s\n' "ERROR: No model ID — cannot create pipelines."
    exit 1
fi

# Blank separator line followed by the status banner.
printf '\n%s\n' "Model ${MODEL_ID} deployed."

# --- Create ingest pipeline (matches production exactly) ---
# Defines tuwunel_embedding_pipeline with one text_embedding processor that
# uses model $MODEL_ID and maps the "body" field to "embedding".
echo "Creating ingest pipeline: tuwunel_embedding_pipeline..."
# Unquoted here-doc delimiter so that $MODEL_ID is expanded into the payload.
INGEST_PIPELINE_JSON=$(cat <<EOF
{
    "description": "Tuwunel message embedding pipeline",
    "processors": [{
      "text_embedding": {
        "model_id": "$MODEL_ID",
        "field_map": {
          "body": "embedding"
        }
      }
    }]
  }
EOF
)
curl --silent --fail --request PUT \
  --header 'Content-Type: application/json' \
  --data "$INGEST_PIPELINE_JSON" \
  "$OS/_ingest/pipeline/tuwunel_embedding_pipeline" > /dev/null
echo "Done."

# --- Create search pipeline (matches production exactly) ---
# Defines tuwunel_hybrid_pipeline with a normalization-processor: min_max
# normalization, arithmetic_mean combination, weights [0.3, 0.7].
echo "Creating search pipeline: tuwunel_hybrid_pipeline..."
SEARCH_PIPELINE_JSON='{
    "description": "Tuwunel hybrid BM25+neural search pipeline",
    "phase_results_processors": [{
      "normalization-processor": {
        "normalization": { "technique": "min_max" },
        "combination": {
          "technique": "arithmetic_mean",
          "parameters": { "weights": [0.3, 0.7] }
        }
      }
    }]
  }'
curl --silent --fail --request PUT \
  --header 'Content-Type: application/json' \
  --data "$SEARCH_PIPELINE_JSON" \
  "$OS/_search/pipeline/tuwunel_hybrid_pipeline" > /dev/null
echo "Done."

# Final summary: a blank line, then the model ID and the two pipeline names.
cat <<EOF

OpenSearch ML init complete.
  Model: all-mpnet-base-v2 ($MODEL_ID)
  Ingest pipeline: tuwunel_embedding_pipeline
  Search pipeline: tuwunel_hybrid_pipeline
EOF