Add OpenSearch search backend with hybrid neural+BM25 support

Extract a SearchBackend trait from the existing RocksDB search code and add an OpenSearch implementation supporting cross-room search, relevance ranking, fuzzy matching, English stemming, and optional hybrid neural+BM25 semantic search using sentence-transformers. Fix macOS build by gating RLIMIT_NPROC and getrusage to supported platforms.
2026-03-08 17:41:20 +00:00
parent 9d47ffff05
commit c9cddc80d9
15 changed files with 2328 additions and 196 deletions
--- a/tuwunel-example.toml
+++ b/tuwunel-example.toml
@@ -907,6 +907,72 @@
 #
 #auto_deactivate_banned_room_attempts = false

+# Search backend to use for full-text message search.
+#
+# Available options: "rocksdb" (default) or "opensearch".
+#
+#search_backend = "rocksdb"
+
+# URL of the OpenSearch instance. Required when search_backend is
+# "opensearch".
+#
+# example: "http://localhost:9200"
+#
+#search_opensearch_url =
+
+# Name of the OpenSearch index for message search.
+#
+#search_opensearch_index = "tuwunel_messages"
+
+# Credentials for authenticating to OpenSearch, in "user:pass" format.
+#
+#search_opensearch_auth =
+
+# Maximum number of documents to batch before flushing to OpenSearch.
+#
+#search_opensearch_batch_size = 100
+
+# Maximum time in milliseconds to wait before flushing a partial batch
+# to OpenSearch.
+#
+#search_opensearch_flush_interval_ms = 1000
+
+# Enable hybrid neural+BM25 search in OpenSearch. Requires an ML model
+# deployed in OpenSearch and an ingest pipeline that populates an
+# "embedding" field.
+#
+# When enabled, tuwunel will:
+# - Create the index with a knn_vector "embedding" field
+# - Attach the ingest pipeline (search_opensearch_pipeline) to the index
+# - Use hybrid queries combining BM25 + neural kNN scoring
+#
+# For a complete reference on configuring OpenSearch's ML plugin, model
+# registration, and ingest pipeline setup, see the test helpers in the
+# `tests` module of `src/service/rooms/search/opensearch.rs` (such as
+# `ensure_neural_model` and `ensure_ingest_pipeline`).
+#
+# See also: https://opensearch.org/docs/latest/search-plugins/neural-search/
+#
+#search_opensearch_hybrid = false
+
+# The model ID registered in OpenSearch for neural search. Required when
+# search_opensearch_hybrid is enabled.
+#
+# example: "aKV84osBBHNT0StI3MBr"
+#
+#search_opensearch_model_id =
+
+# Embedding dimension for the neural search model. Must match the output
+# dimension of the deployed model. Common values: 384
+# (all-MiniLM-L6-v2), 768 (msmarco-distilbert-base-tas-b).
+#
+#search_opensearch_embedding_dim = 384
+
+# Name of the ingest pipeline that generates embeddings for the
+# "embedding" field. This pipeline must already exist in OpenSearch.
+#
+#search_opensearch_pipeline = "tuwunel_embedding_pipeline"
+
 # RocksDB log level. This is not the same as tuwunel's log level. This
 # is the log level for the RocksDB engine/library which show up in your
 # database folder/path as `LOG` files. tuwunel will log RocksDB errors