feat: initial Sol virtual librarian implementation

Matrix bot with E2EE (matrix-sdk 0.9) that passively archives all
messages to OpenSearch and responds to queries via Mistral AI with
function calling tools.

Core systems:
- Archive: bulk OpenSearch indexer with batch/flush, edit/redaction
  handling, embedding pipeline passthrough
- Brain: rule-based engagement evaluator (mentions, DMs, name
  invocations), LLM-powered spontaneous engagement, per-room
  conversation context windows, response delay simulation
- Tools: search_archive, get_room_context, list_rooms, get_room_members
  registered as Mistral function calling tools with iterative tool loop
- Personality: templated system prompt with Sol's librarian persona

47 unit tests covering config, evaluator, conversation windowing,
personality templates, schema serialization, and search query building.
This commit is contained in:
2026-03-20 21:40:13 +00:00
commit 4dc20bee23
21 changed files with 6934 additions and 0 deletions

148
src/archive/indexer.rs Normal file
View File

@@ -0,0 +1,148 @@
use std::sync::Arc;
use opensearch::http::request::JsonBody;
use opensearch::OpenSearch;
use serde_json::json;
use tokio::sync::Mutex;
use tokio::time::{interval, Duration};
use tracing::{debug, error, warn};
use crate::config::Config;
use super::schema::ArchiveDocument;
/// Buffered bulk indexer that writes [`ArchiveDocument`]s into OpenSearch.
///
/// Documents accumulate in an in-memory buffer and are written via the bulk
/// API, either when the configured batch size is reached (see `add`) or on a
/// periodic tick (see `start_flush_task`).
pub struct Indexer {
    // Behind Arc<Mutex<..>> so the spawned flush task can drain the same
    // buffer that producers push into.
    buffer: Arc<Mutex<Vec<ArchiveDocument>>>,
    client: OpenSearch,
    config: Arc<Config>,
}
impl Indexer {
    /// Creates an indexer with an empty buffer around the given client/config.
    pub fn new(client: OpenSearch, config: Arc<Config>) -> Self {
        Self {
            buffer: Arc::new(Mutex::new(Vec::new())),
            client,
            config,
        }
    }

    /// Queues a document for indexing. When the buffer reaches the configured
    /// `batch_size`, the whole batch is flushed immediately; flush failures
    /// are logged rather than surfaced (archiving is best-effort).
    pub async fn add(&self, doc: ArchiveDocument) {
        let mut buffer = self.buffer.lock().await;
        buffer.push(doc);
        let batch_size = self.config.opensearch.batch_size;
        if buffer.len() >= batch_size {
            let docs: Vec<ArchiveDocument> = buffer.drain(..).collect();
            // Release the lock before the network round-trip so other
            // producers (and the periodic flush task) are not blocked while
            // the batch is in flight.
            drop(buffer);
            if let Err(e) = self.flush_docs(docs).await {
                error!("Failed to flush archive batch: {e}");
            }
        }
    }

    /// Marks an already-indexed message as edited, replacing its content.
    ///
    /// NOTE(review): this is a partial update against the index only; if the
    /// original event is still sitting in the in-memory buffer (not yet
    /// flushed) the update misses it — confirm edits cannot arrive within the
    /// flush window, or flush before updating.
    pub async fn update_edit(&self, event_id: &str, new_content: &str) {
        let body = json!({
            "doc": {
                "content": new_content,
                "edited": true
            }
        });
        self.partial_update(event_id, body, "edited").await;
    }

    /// Blanks the content of a redacted message and flags it as redacted.
    /// Same not-yet-flushed caveat as [`Self::update_edit`].
    pub async fn update_redaction(&self, event_id: &str) {
        let body = json!({
            "doc": {
                "content": "",
                "redacted": true
            }
        });
        self.partial_update(event_id, body, "redacted").await;
    }

    /// Shared partial-update path for edits and redactions.
    ///
    /// `action` is only used in the warning message ("edited"/"redacted").
    /// Failures are logged, not propagated: a missed update should never take
    /// down the event pipeline.
    async fn partial_update(&self, event_id: &str, body: serde_json::Value, action: &str) {
        if let Err(e) = self
            .client
            .update(opensearch::UpdateParts::IndexId(
                &self.config.opensearch.index,
                event_id,
            ))
            .body(body)
            .send()
            .await
        {
            warn!(event_id, "Failed to update {action} message: {e}");
        }
    }

    /// Spawns a background task that flushes the buffer every
    /// `flush_interval_ms`, so trickling messages don't sit unindexed until a
    /// full batch accumulates. The task runs until the handle is aborted or
    /// the runtime shuts down.
    pub fn start_flush_task(self: &Arc<Self>) -> tokio::task::JoinHandle<()> {
        let this = Arc::clone(self);
        tokio::spawn(async move {
            let mut tick = interval(Duration::from_millis(
                this.config.opensearch.flush_interval_ms,
            ));
            loop {
                tick.tick().await;
                let mut buffer = this.buffer.lock().await;
                if buffer.is_empty() {
                    continue;
                }
                let docs: Vec<ArchiveDocument> = buffer.drain(..).collect();
                // Drop the guard before awaiting the bulk request so `add`
                // callers are never blocked on network I/O.
                drop(buffer);
                if let Err(e) = this.flush_docs(docs).await {
                    error!("Periodic flush failed: {e}");
                }
            }
        })
    }

    /// Writes a batch to OpenSearch through the bulk API, routing documents
    /// through the configured embedding ingest pipeline.
    ///
    /// # Errors
    /// Returns an error on transport failure or a non-success HTTP status.
    /// Per-item failures inside an otherwise successful bulk response are
    /// only logged. NOTE(review): documents in a failed batch are dropped,
    /// not re-queued — confirm that loss is acceptable for the archive.
    async fn flush_docs(&self, docs: Vec<ArchiveDocument>) -> anyhow::Result<()> {
        if docs.is_empty() {
            return Ok(());
        }
        let index = &self.config.opensearch.index;
        let pipeline = &self.config.opensearch.embedding_pipeline;
        // Bulk body alternates an action-metadata line and a document line,
        // hence capacity for two entries per document. `_id` is the Matrix
        // event ID so later partial updates can target the same document.
        let mut body: Vec<JsonBody<serde_json::Value>> = Vec::with_capacity(docs.len() * 2);
        for doc in &docs {
            body.push(
                json!({
                    "index": {
                        "_index": index,
                        "_id": doc.event_id
                    }
                })
                .into(),
            );
            body.push(serde_json::to_value(doc)?.into());
        }
        let response = self
            .client
            .bulk(opensearch::BulkParts::None)
            .pipeline(pipeline)
            .body(body)
            .send()
            .await?;
        if !response.status_code().is_success() {
            let text = response.text().await?;
            anyhow::bail!("Bulk index failed: {text}");
        }
        // A 200 bulk response can still carry per-item errors; surface them.
        let result: serde_json::Value = response.json().await?;
        if result["errors"].as_bool().unwrap_or(false) {
            warn!("Bulk index had errors: {}", serde_json::to_string_pretty(&result)?);
        } else {
            debug!(count = docs.len(), "Flushed documents to OpenSearch");
        }
        Ok(())
    }
}

2
src/archive/mod.rs Normal file
View File

@@ -0,0 +1,2 @@
/// Buffered bulk writer that streams archived events into OpenSearch.
pub mod indexer;
/// Document model, index mapping, and index-creation helper for the archive.
pub mod schema;

205
src/archive/schema.rs Normal file
View File

@@ -0,0 +1,205 @@
use opensearch::OpenSearch;
use serde::{Deserialize, Serialize};
use tracing::info;
/// One archived Matrix event as stored in the OpenSearch index.
///
/// `None` optionals are omitted from the serialized JSON; missing fields
/// deserialize to their defaults, so older/sparser documents round-trip.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchiveDocument {
    /// Matrix event ID; also used as the OpenSearch document `_id`.
    pub event_id: String,
    pub room_id: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub room_name: Option<String>,
    /// Full Matrix user ID of the sender (e.g. `@alice:server`).
    pub sender: String,
    /// Display name at send time, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sender_name: Option<String>,
    /// Milliseconds since the Unix epoch (`epoch_millis` in the mapping).
    pub timestamp: i64,
    pub content: String,
    /// Event ID this message replies to, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reply_to: Option<String>,
    /// Thread root event ID, if the message is part of a thread.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thread_id: Option<String>,
    #[serde(default)]
    pub media_urls: Vec<String>,
    /// Matrix event type, e.g. `m.room.message`.
    pub event_type: String,
    // Set to true by Indexer::update_edit when a later edit arrives.
    #[serde(default)]
    pub edited: bool,
    // Set to true (with content blanked) by Indexer::update_redaction.
    #[serde(default)]
    pub redacted: bool,
}
// Settings + mappings used when creating the archive index. Field types must
// stay in sync with ArchiveDocument: keywords for exact-match IDs/names,
// `text` for full-text search over content, `epoch_millis` matching the i64
// timestamp. Single shard / no replicas suits a single-node deployment.
const INDEX_MAPPING: &str = r#"{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0
  },
  "mappings": {
    "properties": {
      "event_id": { "type": "keyword" },
      "room_id": { "type": "keyword" },
      "room_name": { "type": "keyword" },
      "sender": { "type": "keyword" },
      "sender_name": { "type": "keyword" },
      "timestamp": { "type": "date", "format": "epoch_millis" },
      "content": { "type": "text", "analyzer": "standard" },
      "reply_to": { "type": "keyword" },
      "thread_id": { "type": "keyword" },
      "media_urls": { "type": "keyword" },
      "event_type": { "type": "keyword" },
      "edited": { "type": "boolean" },
      "redacted": { "type": "boolean" }
    }
  }
}"#;
/// Returns the raw JSON index mapping (exposed mainly for tests).
pub fn index_mapping_json() -> &'static str {
    INDEX_MAPPING
}
/// Ensures the archive index exists, creating it with [`INDEX_MAPPING`] when
/// a HEAD probe reports it missing.
///
/// # Errors
/// Fails if either request errors at the transport level, the embedded
/// mapping is not valid JSON, or the create call returns a non-success
/// status (the response body is included in the error).
pub async fn create_index_if_not_exists(client: &OpenSearch, index: &str) -> anyhow::Result<()> {
    let probe = client
        .indices()
        .exists(opensearch::indices::IndicesExistsParts::Index(&[index]))
        .send()
        .await?;
    // A success status on the exists (HEAD) request means the index is
    // already present and there is nothing more to do.
    if probe.status_code().is_success() {
        info!(index, "OpenSearch index already exists");
        return Ok(());
    }
    let mapping_body: serde_json::Value = serde_json::from_str(INDEX_MAPPING)?;
    let created = client
        .indices()
        .create(opensearch::indices::IndicesCreateParts::Index(index))
        .body(mapping_body)
        .send()
        .await?;
    if created.status_code().is_success() {
        info!(index, "Created OpenSearch index");
        Ok(())
    } else {
        let body = created.text().await?;
        anyhow::bail!("Failed to create index {index}: {body}")
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fully-populated baseline fixture; tests mutate fields as needed.
    fn sample_doc() -> ArchiveDocument {
        ArchiveDocument {
            event_id: "$abc123:sunbeam.pt".to_string(),
            room_id: "!room:sunbeam.pt".to_string(),
            room_name: Some("general".to_string()),
            sender: "@alice:sunbeam.pt".to_string(),
            sender_name: Some("Alice".to_string()),
            timestamp: 1710000000000,
            content: "hello world".to_string(),
            reply_to: None,
            thread_id: None,
            media_urls: vec![],
            event_type: "m.room.message".to_string(),
            edited: false,
            redacted: false,
        }
    }

    #[test]
    fn test_serialize_full_doc() {
        let doc = sample_doc();
        let json = serde_json::to_value(&doc).unwrap();
        assert_eq!(json["event_id"], "$abc123:sunbeam.pt");
        assert_eq!(json["room_id"], "!room:sunbeam.pt");
        assert_eq!(json["room_name"], "general");
        assert_eq!(json["sender"], "@alice:sunbeam.pt");
        assert_eq!(json["sender_name"], "Alice");
        assert_eq!(json["timestamp"], 1710000000000_i64);
        assert_eq!(json["content"], "hello world");
        assert_eq!(json["event_type"], "m.room.message");
        assert_eq!(json["edited"], false);
        assert_eq!(json["redacted"], false);
        assert!(json["media_urls"].as_array().unwrap().is_empty());
    }

    #[test]
    fn test_skip_none_fields() {
        let doc = sample_doc();
        let json_str = serde_json::to_string(&doc).unwrap();
        // reply_to and thread_id are None, should be omitted
        assert!(!json_str.contains("reply_to"));
        assert!(!json_str.contains("thread_id"));
    }

    #[test]
    fn test_serialize_with_optional_fields() {
        let mut doc = sample_doc();
        doc.reply_to = Some("$parent:sunbeam.pt".to_string());
        doc.thread_id = Some("$thread:sunbeam.pt".to_string());
        doc.media_urls = vec!["mxc://sunbeam.pt/abc".to_string()];
        doc.edited = true;
        let json = serde_json::to_value(&doc).unwrap();
        assert_eq!(json["reply_to"], "$parent:sunbeam.pt");
        assert_eq!(json["thread_id"], "$thread:sunbeam.pt");
        assert_eq!(json["media_urls"][0], "mxc://sunbeam.pt/abc");
        assert_eq!(json["edited"], true);
    }

    #[test]
    fn test_deserialize_roundtrip() {
        let doc = sample_doc();
        let json_str = serde_json::to_string(&doc).unwrap();
        let deserialized: ArchiveDocument = serde_json::from_str(&json_str).unwrap();
        assert_eq!(deserialized.event_id, doc.event_id);
        assert_eq!(deserialized.room_id, doc.room_id);
        assert_eq!(deserialized.room_name, doc.room_name);
        assert_eq!(deserialized.sender, doc.sender);
        assert_eq!(deserialized.content, doc.content);
        assert_eq!(deserialized.timestamp, doc.timestamp);
        assert_eq!(deserialized.edited, doc.edited);
        assert_eq!(deserialized.redacted, doc.redacted);
    }

    #[test]
    fn test_deserialize_with_defaults() {
        // Simulate a document missing optional/default fields
        let json = r#"{
            "event_id": "$x:s",
            "room_id": "!r:s",
            "sender": "@a:s",
            "timestamp": 1000,
            "content": "test",
            "event_type": "m.room.message"
        }"#;
        let doc: ArchiveDocument = serde_json::from_str(json).unwrap();
        assert!(doc.room_name.is_none());
        assert!(doc.sender_name.is_none());
        assert!(doc.reply_to.is_none());
        assert!(doc.thread_id.is_none());
        assert!(doc.media_urls.is_empty());
        assert!(!doc.edited);
        assert!(!doc.redacted);
    }

    #[test]
    fn test_index_mapping_is_valid_json() {
        let mapping: serde_json::Value =
            serde_json::from_str(index_mapping_json()).unwrap();
        assert!(mapping["settings"]["number_of_shards"].is_number());
        // assert_eq! reports both sides on failure, unlike assert!(a == b).
        assert_eq!(
            mapping["mappings"]["properties"]["event_id"]["type"]
                .as_str()
                .unwrap(),
            "keyword"
        );
        assert_eq!(
            mapping["mappings"]["properties"]["content"]["type"]
                .as_str()
                .unwrap(),
            "text"
        );
        assert_eq!(
            mapping["mappings"]["properties"]["timestamp"]["type"]
                .as_str()
                .unwrap(),
            "date"
        );
    }
}