feat: initial Sol virtual librarian implementation
Matrix bot with E2EE (matrix-sdk 0.9) that passively archives all messages to OpenSearch and responds to queries via Mistral AI with function calling tools. Core systems: - Archive: bulk OpenSearch indexer with batch/flush, edit/redaction handling, embedding pipeline passthrough - Brain: rule-based engagement evaluator (mentions, DMs, name invocations), LLM-powered spontaneous engagement, per-room conversation context windows, response delay simulation - Tools: search_archive, get_room_context, list_rooms, get_room_members registered as Mistral function calling tools with iterative tool loop - Personality: templated system prompt with Sol's librarian persona. Includes 47 unit tests covering config, evaluator, conversation windowing, personality templates, schema serialization, and search query building.
This commit is contained in:
148
src/archive/indexer.rs
Normal file
148
src/archive/indexer.rs
Normal file
@@ -0,0 +1,148 @@
|
||||
use std::sync::Arc;
|
||||
|
||||
use opensearch::http::request::JsonBody;
|
||||
use opensearch::OpenSearch;
|
||||
use serde_json::json;
|
||||
use tokio::sync::Mutex;
|
||||
use tokio::time::{interval, Duration};
|
||||
use tracing::{debug, error, warn};
|
||||
|
||||
use crate::config::Config;
|
||||
use super::schema::ArchiveDocument;
|
||||
|
||||
/// Buffered bulk indexer for the message archive.
///
/// Documents accumulate in an in-memory buffer and are written to
/// OpenSearch in batches — either when the configured batch size is
/// reached (see `add`) or on the periodic flush tick started by
/// `start_flush_task`.
pub struct Indexer {
    // Shared between message handlers and the background flush task;
    // a tokio (async) Mutex so lock acquisition can be awaited.
    buffer: Arc<Mutex<Vec<ArchiveDocument>>>,
    // OpenSearch client used for bulk indexing and partial updates.
    client: OpenSearch,
    // Application config; batch size, flush interval, index name and
    // embedding pipeline are read from `config.opensearch`.
    config: Arc<Config>,
}
|
||||
|
||||
impl Indexer {
|
||||
pub fn new(client: OpenSearch, config: Arc<Config>) -> Self {
|
||||
Self {
|
||||
buffer: Arc::new(Mutex::new(Vec::new())),
|
||||
client,
|
||||
config,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn add(&self, doc: ArchiveDocument) {
|
||||
let mut buffer = self.buffer.lock().await;
|
||||
buffer.push(doc);
|
||||
let batch_size = self.config.opensearch.batch_size;
|
||||
if buffer.len() >= batch_size {
|
||||
let docs: Vec<ArchiveDocument> = buffer.drain(..).collect();
|
||||
drop(buffer);
|
||||
if let Err(e) = self.flush_docs(docs).await {
|
||||
error!("Failed to flush archive batch: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn update_edit(&self, event_id: &str, new_content: &str) {
|
||||
let body = json!({
|
||||
"doc": {
|
||||
"content": new_content,
|
||||
"edited": true
|
||||
}
|
||||
});
|
||||
if let Err(e) = self
|
||||
.client
|
||||
.update(opensearch::UpdateParts::IndexId(
|
||||
&self.config.opensearch.index,
|
||||
event_id,
|
||||
))
|
||||
.body(body)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
warn!(event_id, "Failed to update edited message: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn update_redaction(&self, event_id: &str) {
|
||||
let body = json!({
|
||||
"doc": {
|
||||
"content": "",
|
||||
"redacted": true
|
||||
}
|
||||
});
|
||||
if let Err(e) = self
|
||||
.client
|
||||
.update(opensearch::UpdateParts::IndexId(
|
||||
&self.config.opensearch.index,
|
||||
event_id,
|
||||
))
|
||||
.body(body)
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
warn!(event_id, "Failed to update redacted message: {e}");
|
||||
}
|
||||
}
|
||||
|
||||
pub fn start_flush_task(self: &Arc<Self>) -> tokio::task::JoinHandle<()> {
|
||||
let this = Arc::clone(self);
|
||||
tokio::spawn(async move {
|
||||
let mut tick = interval(Duration::from_millis(
|
||||
this.config.opensearch.flush_interval_ms,
|
||||
));
|
||||
loop {
|
||||
tick.tick().await;
|
||||
let mut buffer = this.buffer.lock().await;
|
||||
if buffer.is_empty() {
|
||||
continue;
|
||||
}
|
||||
let docs: Vec<ArchiveDocument> = buffer.drain(..).collect();
|
||||
drop(buffer);
|
||||
if let Err(e) = this.flush_docs(docs).await {
|
||||
error!("Periodic flush failed: {e}");
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
async fn flush_docs(&self, docs: Vec<ArchiveDocument>) -> anyhow::Result<()> {
|
||||
if docs.is_empty() {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let index = &self.config.opensearch.index;
|
||||
let pipeline = &self.config.opensearch.embedding_pipeline;
|
||||
|
||||
let mut body: Vec<JsonBody<serde_json::Value>> = Vec::with_capacity(docs.len() * 2);
|
||||
for doc in &docs {
|
||||
body.push(
|
||||
json!({
|
||||
"index": {
|
||||
"_index": index,
|
||||
"_id": doc.event_id
|
||||
}
|
||||
})
|
||||
.into(),
|
||||
);
|
||||
body.push(serde_json::to_value(doc)?.into());
|
||||
}
|
||||
|
||||
let response = self
|
||||
.client
|
||||
.bulk(opensearch::BulkParts::None)
|
||||
.pipeline(pipeline)
|
||||
.body(body)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if !response.status_code().is_success() {
|
||||
let text = response.text().await?;
|
||||
anyhow::bail!("Bulk index failed: {text}");
|
||||
}
|
||||
|
||||
let result: serde_json::Value = response.json().await?;
|
||||
if result["errors"].as_bool().unwrap_or(false) {
|
||||
warn!("Bulk index had errors: {}", serde_json::to_string_pretty(&result)?);
|
||||
} else {
|
||||
debug!(count = docs.len(), "Flushed documents to OpenSearch");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
2
src/archive/mod.rs
Normal file
2
src/archive/mod.rs
Normal file
@@ -0,0 +1,2 @@
|
||||
pub mod indexer;
|
||||
pub mod schema;
|
||||
205
src/archive/schema.rs
Normal file
205
src/archive/schema.rs
Normal file
@@ -0,0 +1,205 @@
|
||||
use opensearch::OpenSearch;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use tracing::info;
|
||||
|
||||
/// One archived Matrix event as stored in the OpenSearch index.
///
/// Serialized with serde: `None` optionals are omitted from the JSON,
/// and missing `media_urls`/`edited`/`redacted` deserialize to their
/// defaults, so older documents without those fields still load.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ArchiveDocument {
    // Matrix event ID; also used as the OpenSearch `_id`.
    pub event_id: String,
    // Matrix room ID the event was sent in.
    pub room_id: String,
    // Human-readable room name, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub room_name: Option<String>,
    // Sender's Matrix user ID.
    pub sender: String,
    // Sender's display name, when known.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub sender_name: Option<String>,
    // Event timestamp in epoch milliseconds (matches the index mapping's
    // `epoch_millis` date format).
    pub timestamp: i64,
    // Message body text; blanked on redaction.
    pub content: String,
    // Event ID this message replies to, if any.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub reply_to: Option<String>,
    // Thread root event ID, if the message is part of a thread.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub thread_id: Option<String>,
    // Attached media URLs (presumably mxc:// URIs — see tests).
    #[serde(default)]
    pub media_urls: Vec<String>,
    // Matrix event type, e.g. "m.room.message".
    pub event_type: String,
    // True once the message has been edited (set via update_edit).
    #[serde(default)]
    pub edited: bool,
    // True once the message has been redacted (set via update_redaction).
    #[serde(default)]
    pub redacted: bool,
}
|
||||
|
||||
/// Index settings and field mappings for the archive index.
///
/// Kept in sync with `ArchiveDocument`: every struct field has a mapped
/// property. `content` is the only full-text (`text`) field; everything
/// else is `keyword`/`date`/`boolean` for exact filtering. Single shard,
/// no replicas — sized for a single-node deployment.
const INDEX_MAPPING: &str = r#"{
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "event_id": { "type": "keyword" },
            "room_id": { "type": "keyword" },
            "room_name": { "type": "keyword" },
            "sender": { "type": "keyword" },
            "sender_name": { "type": "keyword" },
            "timestamp": { "type": "date", "format": "epoch_millis" },
            "content": { "type": "text", "analyzer": "standard" },
            "reply_to": { "type": "keyword" },
            "thread_id": { "type": "keyword" },
            "media_urls": { "type": "keyword" },
            "event_type": { "type": "keyword" },
            "edited": { "type": "boolean" },
            "redacted": { "type": "boolean" }
        }
    }
}"#;

/// Returns the raw JSON index mapping (exposed for tests and tooling).
pub fn index_mapping_json() -> &'static str {
    INDEX_MAPPING
}
|
||||
|
||||
pub async fn create_index_if_not_exists(client: &OpenSearch, index: &str) -> anyhow::Result<()> {
|
||||
let exists = client
|
||||
.indices()
|
||||
.exists(opensearch::indices::IndicesExistsParts::Index(&[index]))
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if exists.status_code().is_success() {
|
||||
info!(index, "OpenSearch index already exists");
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
let mapping: serde_json::Value = serde_json::from_str(INDEX_MAPPING)?;
|
||||
let response = client
|
||||
.indices()
|
||||
.create(opensearch::indices::IndicesCreateParts::Index(index))
|
||||
.body(mapping)
|
||||
.send()
|
||||
.await?;
|
||||
|
||||
if !response.status_code().is_success() {
|
||||
let body = response.text().await?;
|
||||
anyhow::bail!("Failed to create index {index}: {body}");
|
||||
}
|
||||
|
||||
info!(index, "Created OpenSearch index");
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// A fully-populated document with the optional fields left `None`.
    fn sample_doc() -> ArchiveDocument {
        ArchiveDocument {
            event_id: "$abc123:sunbeam.pt".to_string(),
            room_id: "!room:sunbeam.pt".to_string(),
            room_name: Some("general".to_string()),
            sender: "@alice:sunbeam.pt".to_string(),
            sender_name: Some("Alice".to_string()),
            timestamp: 1710000000000,
            content: "hello world".to_string(),
            reply_to: None,
            thread_id: None,
            media_urls: vec![],
            event_type: "m.room.message".to_string(),
            edited: false,
            redacted: false,
        }
    }

    #[test]
    fn test_serialize_full_doc() {
        let doc = sample_doc();
        let json = serde_json::to_value(&doc).unwrap();

        assert_eq!(json["event_id"], "$abc123:sunbeam.pt");
        assert_eq!(json["room_id"], "!room:sunbeam.pt");
        assert_eq!(json["room_name"], "general");
        assert_eq!(json["sender"], "@alice:sunbeam.pt");
        assert_eq!(json["sender_name"], "Alice");
        assert_eq!(json["timestamp"], 1710000000000_i64);
        assert_eq!(json["content"], "hello world");
        assert_eq!(json["event_type"], "m.room.message");
        assert_eq!(json["edited"], false);
        assert_eq!(json["redacted"], false);
        assert!(json["media_urls"].as_array().unwrap().is_empty());
    }

    #[test]
    fn test_skip_none_fields() {
        let doc = sample_doc();
        let json_str = serde_json::to_string(&doc).unwrap();
        // reply_to and thread_id are None, should be omitted
        assert!(!json_str.contains("reply_to"));
        assert!(!json_str.contains("thread_id"));
    }

    #[test]
    fn test_serialize_with_optional_fields() {
        let mut doc = sample_doc();
        doc.reply_to = Some("$parent:sunbeam.pt".to_string());
        doc.thread_id = Some("$thread:sunbeam.pt".to_string());
        doc.media_urls = vec!["mxc://sunbeam.pt/abc".to_string()];
        doc.edited = true;

        let json = serde_json::to_value(&doc).unwrap();
        assert_eq!(json["reply_to"], "$parent:sunbeam.pt");
        assert_eq!(json["thread_id"], "$thread:sunbeam.pt");
        assert_eq!(json["media_urls"][0], "mxc://sunbeam.pt/abc");
        assert_eq!(json["edited"], true);
    }

    #[test]
    fn test_deserialize_roundtrip() {
        let doc = sample_doc();
        let json_str = serde_json::to_string(&doc).unwrap();
        let deserialized: ArchiveDocument = serde_json::from_str(&json_str).unwrap();

        assert_eq!(deserialized.event_id, doc.event_id);
        assert_eq!(deserialized.room_id, doc.room_id);
        assert_eq!(deserialized.room_name, doc.room_name);
        assert_eq!(deserialized.sender, doc.sender);
        assert_eq!(deserialized.content, doc.content);
        assert_eq!(deserialized.timestamp, doc.timestamp);
        assert_eq!(deserialized.edited, doc.edited);
        assert_eq!(deserialized.redacted, doc.redacted);
    }

    #[test]
    fn test_deserialize_with_defaults() {
        // Simulate a document missing optional/default fields
        let json = r#"{
            "event_id": "$x:s",
            "room_id": "!r:s",
            "sender": "@a:s",
            "timestamp": 1000,
            "content": "test",
            "event_type": "m.room.message"
        }"#;
        let doc: ArchiveDocument = serde_json::from_str(json).unwrap();
        assert!(doc.room_name.is_none());
        assert!(doc.sender_name.is_none());
        assert!(doc.reply_to.is_none());
        assert!(doc.thread_id.is_none());
        assert!(doc.media_urls.is_empty());
        assert!(!doc.edited);
        assert!(!doc.redacted);
    }

    #[test]
    fn test_index_mapping_is_valid_json() {
        let mapping: serde_json::Value =
            serde_json::from_str(index_mapping_json()).unwrap();
        assert!(mapping["settings"]["number_of_shards"].is_number());
        // assert_eq! against Value (as the other tests do) prints both
        // sides on mismatch and doesn't panic opaquely on a missing key,
        // unlike the previous `assert!(x.as_str().unwrap() == ...)`.
        assert_eq!(mapping["mappings"]["properties"]["event_id"]["type"], "keyword");
        assert_eq!(mapping["mappings"]["properties"]["content"]["type"], "text");
        assert_eq!(mapping["mappings"]["properties"]["timestamp"]["type"], "date");
    }
}
|
||||
Reference in New Issue
Block a user