Add OpenSearch search backend with hybrid neural+BM25 support

Extract a SearchBackend trait from the existing RocksDB search code and
add an OpenSearch implementation supporting cross-room search, relevance
ranking, fuzzy matching, English stemming, and optional hybrid
neural+BM25 semantic search using sentence-transformers.

Fix macOS build by gating RLIMIT_NPROC and getrusage to supported
platforms.
This commit is contained in:
2026-03-08 17:41:20 +00:00
parent 9d47ffff05
commit c9cddc80d9
15 changed files with 2328 additions and 196 deletions

View File

@@ -116,6 +116,22 @@ pub fn check(config: &Config) -> Result {
});
}
if config.search_backend == super::SearchBackendConfig::OpenSearch
&& config.search_opensearch_url.is_none()
{
return Err!(Config(
"search_opensearch_url",
"OpenSearch URL must be set when search_backend is \"opensearch\""
));
}
if config.search_opensearch_hybrid && config.search_opensearch_model_id.is_none() {
return Err!(Config(
"search_opensearch_model_id",
"Model ID must be set when search_opensearch_hybrid is enabled"
));
}
// rocksdb does not allow max_log_files to be 0
if config.rocksdb_max_log_files == 0 {
return Err!(Config(

View File

@@ -1100,6 +1100,85 @@ pub struct Config {
#[serde(default)]
pub auto_deactivate_banned_room_attempts: bool,
/// Search backend to use for full-text message search.
///
/// Available options: "rocksdb" (default) or "opensearch".
///
/// default: "rocksdb"
#[serde(default)]
pub search_backend: SearchBackendConfig,
/// URL of the OpenSearch instance. Required when search_backend is
/// "opensearch".
///
/// example: "http://localhost:9200"
pub search_opensearch_url: Option<Url>,
/// Name of the OpenSearch index for message search.
///
/// default: "tuwunel_messages"
#[serde(default = "default_search_opensearch_index")]
pub search_opensearch_index: String,
/// Authentication for OpenSearch in "user:pass" format.
///
/// display: sensitive
pub search_opensearch_auth: Option<String>,
/// Maximum number of documents to batch before flushing to OpenSearch.
///
/// default: 100
#[serde(default = "default_search_opensearch_batch_size")]
pub search_opensearch_batch_size: usize,
/// Maximum time in milliseconds to wait before flushing a partial batch
/// to OpenSearch.
///
/// default: 1000
#[serde(default = "default_search_opensearch_flush_interval_ms")]
pub search_opensearch_flush_interval_ms: u64,
/// Enable hybrid neural+BM25 search in OpenSearch. Requires an ML model
/// deployed in OpenSearch and an ingest pipeline that populates an
/// "embedding" field.
///
/// When enabled, tuwunel will:
/// - Create the index with a knn_vector "embedding" field
/// - Attach the ingest pipeline (search_opensearch_pipeline) to the index
/// - Use hybrid queries combining BM25 + neural kNN scoring
///
/// For a complete reference on configuring OpenSearch's ML plugin, model
/// registration, and ingest pipeline setup, see the test helpers in
/// `src/service/rooms/search/opensearch.rs` (the `ensure_neural_model`,
/// `ensure_ingest_pipeline`, etc. functions in the `tests` module).
///
/// See also: https://opensearch.org/docs/latest/search-plugins/neural-search/
///
/// default: false
#[serde(default)]
pub search_opensearch_hybrid: bool,
/// The model ID registered in OpenSearch for neural search. Required when
/// search_opensearch_hybrid is enabled.
///
/// example: "aKV84osBBHNT0StI3MBr"
pub search_opensearch_model_id: Option<String>,
/// Embedding dimension for the neural search model. Must match the output
/// dimension of the deployed model. Common values: 384
/// (all-MiniLM-L6-v2), 768 (msmarco-distilbert-base-tas-b).
///
/// default: 384
#[serde(default = "default_search_opensearch_embedding_dim")]
pub search_opensearch_embedding_dim: usize,
/// Name of the ingest pipeline that generates embeddings for the
/// "embedding" field. This pipeline must already exist in OpenSearch.
///
/// default: "tuwunel_embedding_pipeline"
#[serde(default = "default_search_opensearch_pipeline")]
pub search_opensearch_pipeline: String,
/// RocksDB log level. This is not the same as tuwunel's log level. This
/// is the log level for the RocksDB engine/library which show up in your
/// database folder/path as `LOG` files. tuwunel will log RocksDB errors
@@ -2304,6 +2383,14 @@ pub struct Config {
catchall: BTreeMap<String, IgnoredAny>,
}
/// Selects which full-text message search backend is used.
///
/// Deserialized from the `search_backend` config option. Because of
/// `rename_all = "lowercase"`, the accepted spellings are `"rocksdb"` and
/// `"opensearch"`.
#[derive(Clone, Copy, Debug, Default, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "lowercase")]
pub enum SearchBackendConfig {
/// RocksDB-backed search; used when `search_backend` is not set.
#[default]
RocksDb,
/// OpenSearch-backed search; config validation requires
/// `search_opensearch_url` to be set when this is selected.
OpenSearch,
}
#[derive(Clone, Debug, Deserialize, Default)]
#[config_example_generator(filename = "tuwunel-example.toml", section = "global.tls")]
pub struct TlsConfig {
@@ -3181,6 +3268,16 @@ impl Config {
/// Always returns `true`.
// NOTE(review): presumably referenced via `#[serde(default = "true_fn")]` for
// options that default to enabled -- confirm against the rest of the file.
fn true_fn() -> bool { true }
/// Serde default for `search_opensearch_index`: the OpenSearch index name used
/// when none is configured.
fn default_search_opensearch_index() -> String { String::from("tuwunel_messages") }
/// Serde default for `search_opensearch_batch_size`: how many documents are
/// batched before flushing to OpenSearch when the option is not configured.
fn default_search_opensearch_batch_size() -> usize {
    const DEFAULT_BATCH_SIZE: usize = 100;

    DEFAULT_BATCH_SIZE
}
/// Serde default for `search_opensearch_flush_interval_ms`: the maximum wait,
/// in milliseconds, before a partial batch is flushed to OpenSearch.
fn default_search_opensearch_flush_interval_ms() -> u64 {
    const DEFAULT_FLUSH_INTERVAL_MS: u64 = 1_000;

    DEFAULT_FLUSH_INTERVAL_MS
}
/// Serde default for `search_opensearch_embedding_dim`: the embedding
/// dimension assumed for the neural search model when none is configured.
fn default_search_opensearch_embedding_dim() -> usize {
    const DEFAULT_EMBEDDING_DIM: usize = 384;

    DEFAULT_EMBEDDING_DIM
}
/// Serde default for `search_opensearch_pipeline`: the name of the ingest
/// pipeline used when none is configured.
fn default_search_opensearch_pipeline() -> String {
    String::from("tuwunel_embedding_pipeline")
}
/// Test-only helper: returns the server name `localhost`.
// NOTE(review): presumably supplies a default `server_name` so a `Config` can
// be built in unit tests -- confirm against the field's serde attribute.
#[cfg(test)]
fn default_server_name() -> OwnedServerName { ruma::owned_server_name!("localhost") }