Add OpenSearch search backend with hybrid neural+BM25 support
Extract a SearchBackend trait from the existing RocksDB search code and add an OpenSearch implementation supporting cross-room search, relevance ranking, fuzzy matching, English stemming, and optional hybrid neural+BM25 semantic search using sentence-transformers. Fix macOS build by gating RLIMIT_NPROC and getrusage to supported platforms.
This commit is contained in:
@@ -116,6 +116,22 @@ pub fn check(config: &Config) -> Result {
|
||||
});
|
||||
}
|
||||
|
||||
if config.search_backend == super::SearchBackendConfig::OpenSearch
|
||||
&& config.search_opensearch_url.is_none()
|
||||
{
|
||||
return Err!(Config(
|
||||
"search_opensearch_url",
|
||||
"OpenSearch URL must be set when search_backend is \"opensearch\""
|
||||
));
|
||||
}
|
||||
|
||||
if config.search_opensearch_hybrid && config.search_opensearch_model_id.is_none() {
|
||||
return Err!(Config(
|
||||
"search_opensearch_model_id",
|
||||
"Model ID must be set when search_opensearch_hybrid is enabled"
|
||||
));
|
||||
}
|
||||
|
||||
// rocksdb does not allow max_log_files to be 0
|
||||
if config.rocksdb_max_log_files == 0 {
|
||||
return Err!(Config(
|
||||
|
||||
@@ -1100,6 +1100,85 @@ pub struct Config {
|
||||
#[serde(default)]
|
||||
pub auto_deactivate_banned_room_attempts: bool,
|
||||
|
||||
/// Search backend to use for full-text message search.
|
||||
///
|
||||
/// Available options: "rocksdb" (default) or "opensearch".
|
||||
///
|
||||
/// default: "rocksdb"
|
||||
#[serde(default)]
|
||||
pub search_backend: SearchBackendConfig,
|
||||
|
||||
/// URL of the OpenSearch instance. Required when search_backend is
|
||||
/// "opensearch".
|
||||
///
|
||||
/// example: "http://localhost:9200"
|
||||
pub search_opensearch_url: Option<Url>,
|
||||
|
||||
/// Name of the OpenSearch index for message search.
|
||||
///
|
||||
/// default: "tuwunel_messages"
|
||||
#[serde(default = "default_search_opensearch_index")]
|
||||
pub search_opensearch_index: String,
|
||||
|
||||
/// Authentication for OpenSearch in "user:pass" format.
|
||||
///
|
||||
/// display: sensitive
|
||||
pub search_opensearch_auth: Option<String>,
|
||||
|
||||
/// Maximum number of documents to batch before flushing to OpenSearch.
|
||||
///
|
||||
/// default: 100
|
||||
#[serde(default = "default_search_opensearch_batch_size")]
|
||||
pub search_opensearch_batch_size: usize,
|
||||
|
||||
/// Maximum time in milliseconds to wait before flushing a partial batch
|
||||
/// to OpenSearch.
|
||||
///
|
||||
/// default: 1000
|
||||
#[serde(default = "default_search_opensearch_flush_interval_ms")]
|
||||
pub search_opensearch_flush_interval_ms: u64,
|
||||
|
||||
/// Enable hybrid neural+BM25 search in OpenSearch. Requires an ML model
|
||||
/// deployed in OpenSearch and an ingest pipeline that populates an
|
||||
/// "embedding" field.
|
||||
///
|
||||
/// When enabled, tuwunel will:
|
||||
/// - Create the index with a knn_vector "embedding" field
|
||||
/// - Attach the ingest pipeline (search_opensearch_pipeline) to the index
|
||||
/// - Use hybrid queries combining BM25 + neural kNN scoring
|
||||
///
|
||||
/// For a complete reference on configuring OpenSearch's ML plugin, model
|
||||
/// registration, and ingest pipeline setup, see the test helpers in
|
||||
/// `src/service/rooms/search/opensearch.rs` (the `ensure_neural_model`,
|
||||
/// `ensure_ingest_pipeline`, etc. functions in the `tests` module).
|
||||
///
|
||||
/// See also: https://opensearch.org/docs/latest/search-plugins/neural-search/
|
||||
///
|
||||
/// default: false
|
||||
#[serde(default)]
|
||||
pub search_opensearch_hybrid: bool,
|
||||
|
||||
/// The model ID registered in OpenSearch for neural search. Required when
|
||||
/// search_opensearch_hybrid is enabled.
|
||||
///
|
||||
/// example: "aKV84osBBHNT0StI3MBr"
|
||||
pub search_opensearch_model_id: Option<String>,
|
||||
|
||||
/// Embedding dimension for the neural search model. Must match the output
|
||||
/// dimension of the deployed model. Common values: 384
|
||||
/// (all-MiniLM-L6-v2), 768 (msmarco-distilbert-base-tas-b).
|
||||
///
|
||||
/// default: 384
|
||||
#[serde(default = "default_search_opensearch_embedding_dim")]
|
||||
pub search_opensearch_embedding_dim: usize,
|
||||
|
||||
/// Name of the ingest pipeline that generates embeddings for the
|
||||
/// "embedding" field. This pipeline must already exist in OpenSearch.
|
||||
///
|
||||
/// default: "tuwunel_embedding_pipeline"
|
||||
#[serde(default = "default_search_opensearch_pipeline")]
|
||||
pub search_opensearch_pipeline: String,
|
||||
|
||||
/// RocksDB log level. This is not the same as tuwunel's log level. This
|
||||
/// is the log level for the RocksDB engine/library, whose logs show up in your
|
||||
/// database folder/path as `LOG` files. tuwunel will log RocksDB errors
|
||||
@@ -2304,6 +2383,14 @@ pub struct Config {
|
||||
catchall: BTreeMap<String, IgnoredAny>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, Default, Deserialize, PartialEq, Eq)]
|
||||
#[serde(rename_all = "lowercase")]
|
||||
pub enum SearchBackendConfig {
|
||||
#[default]
|
||||
RocksDb,
|
||||
OpenSearch,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Deserialize, Default)]
|
||||
#[config_example_generator(filename = "tuwunel-example.toml", section = "global.tls")]
|
||||
pub struct TlsConfig {
|
||||
@@ -3181,6 +3268,16 @@ impl Config {
|
||||
|
||||
/// Always returns `true`.
///
/// NOTE(review): presumably referenced from `#[serde(default = "true_fn")]`
/// on boolean options that default to enabled — confirm at the usage sites.
fn true_fn() -> bool { true }
|
||||
|
||||
/// Serde default for `search_opensearch_index`: the OpenSearch index name
/// used when the option is not set in the configuration.
fn default_search_opensearch_index() -> String { String::from("tuwunel_messages") }
|
||||
|
||||
/// Serde default for `search_opensearch_batch_size`: the number of documents
/// to batch before flushing when the option is not set.
fn default_search_opensearch_batch_size() -> usize {
	const DEFAULT_BATCH_SIZE: usize = 100;

	DEFAULT_BATCH_SIZE
}
|
||||
|
||||
/// Serde default for `search_opensearch_flush_interval_ms`: the flush
/// interval, in milliseconds, used when the option is not set.
fn default_search_opensearch_flush_interval_ms() -> u64 {
	const DEFAULT_FLUSH_INTERVAL_MS: u64 = 1000;

	DEFAULT_FLUSH_INTERVAL_MS
}
|
||||
|
||||
/// Serde default for `search_opensearch_embedding_dim`: the embedding
/// dimension used when the option is not set.
fn default_search_opensearch_embedding_dim() -> usize {
	const DEFAULT_EMBEDDING_DIM: usize = 384;

	DEFAULT_EMBEDDING_DIM
}
|
||||
|
||||
/// Serde default for `search_opensearch_pipeline`: the ingest pipeline name
/// used when the option is not set in the configuration.
fn default_search_opensearch_pipeline() -> String { String::from("tuwunel_embedding_pipeline") }
|
||||
|
||||
#[cfg(test)]
|
||||
/// Returns the owned server name `localhost`, built with
/// `ruma::owned_server_name!`.
///
/// NOTE(review): presumably a serde default so test configurations need not
/// specify a server name — confirm where it is referenced.
fn default_server_name() -> OwnedServerName { ruma::owned_server_name!("localhost") }
|
||||
|
||||
|
||||
Reference in New Issue
Block a user