From 57f8d608a59bf2705deda2e4d46c8d6a60ddc067 Mon Sep 17 00:00:00 2001
From: Sienna Meridian Satterwhite
Date: Mon, 23 Mar 2026 23:54:29 +0000
Subject: [PATCH] feat: code index + adaptive breadcrumbs foundation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Code index (sol_code):
- SymbolDocument: file_path, repo_name, language, symbol_name, symbol_kind,
  signature, docstring, branch, source, embedding (768-dim knn_vector)
- CodeIndexer: batch symbol indexer with idempotent upserts
- Branch-aware: symbols scoped to branch with mainline fallback

Breadcrumbs:
- build_breadcrumbs(): adaptive context injection for coding prompts
- Default: project outline via aggregation (modules, types, fns)
- Adaptive: hybrid search (_analyze → symbol matching → BM25 + neural)
- Token budget enforcement with priority (outline first, then relevance)
- format_symbol(): signature + first-line docstring + file:line

Query optimization: uses _analyze API to extract key terms from free-form
user text, matches against actual symbol names in the index before running
the hybrid search.
---
 src/breadcrumbs/mod.rs    | 386 ++++++++++++++++++++++++++++++++++++++
 src/code_index/indexer.rs | 133 +++++++++++++
 src/code_index/mod.rs     |   7 +
 src/code_index/schema.rs  | 177 +++++++++++++++++
 src/main.rs               |   2 +
 5 files changed, 705 insertions(+)
 create mode 100644 src/breadcrumbs/mod.rs
 create mode 100644 src/code_index/indexer.rs
 create mode 100644 src/code_index/mod.rs
 create mode 100644 src/code_index/schema.rs

diff --git a/src/breadcrumbs/mod.rs b/src/breadcrumbs/mod.rs
new file mode 100644
index 0000000..7d4b468
--- /dev/null
+++ b/src/breadcrumbs/mod.rs
@@ -0,0 +1,386 @@
+//! Adaptive breadcrumbs — lightweight code context injected into every prompt.
+//!
+//! Uses OpenSearch's hybrid search (BM25 + neural) to find relevant symbols
+//! from the code index based on the user's message. Always injects a default
+//! project outline, then expands with relevant signatures + docstrings.
+
+use opensearch::OpenSearch;
+use serde::Deserialize;
+use tracing::{debug, warn};
+
+/// A symbol retrieved from the code index.
+#[derive(Debug, Clone, Deserialize)]
+pub struct RetrievedSymbol {
+    pub file_path: String,
+    pub symbol_name: String,
+    pub symbol_kind: String,
+    pub signature: String,
+    #[serde(default)]
+    pub docstring: String,
+    pub start_line: u32,
+}
+
+/// Result of building breadcrumbs for a prompt.
+#[derive(Debug)]
+pub struct BreadcrumbResult {
+    /// The default project outline (~200 tokens).
+    pub outline: String,
+    /// Relevant symbols from adaptive retrieval.
+    pub relevant: Vec<RetrievedSymbol>,
+    /// Ready-to-inject formatted string.
+    pub formatted: String,
+}
+
+/// Build adaptive breadcrumbs for a coding session.
+///
+/// 1. Always: project outline (module names, key types/fns) from aggregation
+/// 2. Adaptive: if user message is substantive, hybrid search for relevant symbols
+/// 3. Format within `token_budget`, which is enforced as a byte length (not model tokens)
+pub async fn build_breadcrumbs(
+    client: &OpenSearch,
+    index: &str,
+    repo_name: &str,
+    branch: &str,
+    user_message: &str,
+    token_budget: usize,
+) -> BreadcrumbResult {
+    let outline = load_project_outline(client, index, repo_name, branch).await;
+
+    let relevant = if user_message.split_whitespace().count() >= 3 {
+        hybrid_symbol_search(client, index, repo_name, branch, user_message, 10).await
+    } else {
+        Vec::new()
+    };
+
+    let formatted = format_with_budget(&outline, &relevant, token_budget);
+
+    BreadcrumbResult { outline, relevant, formatted }
+}
+
+/// Load the project outline: distinct modules, key type names, key function names.
+async fn load_project_outline(
+    client: &OpenSearch,
+    index: &str,
+    repo_name: &str,
+    branch: &str,
+) -> String {
+    let query = serde_json::json!({
+        "size": 0,
+        "query": {
+            "bool": {
+                "filter": [
+                    { "term": { "repo_name": repo_name } },
+                    { "bool": { "should": [
+                        { "term": { "branch": branch } },
+                        { "term": { "branch": "mainline" } },
+                        { "term": { "branch": "main" } }
+                    ]}}
+                ]
+            }
+        },
+        "aggs": {
+            "modules": {
+                "terms": { "field": "file_path", "size": 50 }
+            },
+            "types": {
+                "filter": { "terms": { "symbol_kind": ["struct", "enum", "trait", "class", "interface", "type"] } },
+                "aggs": { "names": { "terms": { "field": "symbol_name", "size": 20 } } }
+            },
+            "functions": {
+                "filter": { "terms": { "symbol_kind": ["function", "method", "async_function"] } },
+                "aggs": { "names": { "terms": { "field": "symbol_name", "size": 20 } } }
+            }
+        }
+    });
+
+    let response = match client
+        .search(opensearch::SearchParts::Index(&[index]))
+        .body(query)
+        .send()
+        .await
+    {
+        Ok(r) => r,
+        Err(e) => {
+            warn!("Failed to load project outline: {e}");
+            return String::new();
+        }
+    };
+
+    let body: serde_json::Value = match response.json().await {
+        Ok(b) => b,
+        Err(e) => {
+            warn!("Failed to parse outline response: {e}");
+            return String::new();
+        }
+    };
+
+    // Extract module paths (deduplicate to directory level)
+    let mut modules: Vec<String> = Vec::new();
+    if let Some(buckets) = body["aggregations"]["modules"]["buckets"].as_array() {
+        for b in buckets {
+            if let Some(path) = b["key"].as_str() {
+                // Extract directory: "src/orchestrator/mod.rs" → "orchestrator"
+                let parts: Vec<&str> = path.split('/').collect();
+                if parts.len() >= 2 {
+                    let module = parts[parts.len() - 2];
+                    if module != "src" && !modules.iter().any(|m| m == module) {
+                        modules.push(module.to_string());
+                    }
+                }
+            }
+        }
+    }
+
+    let type_names = extract_agg_names(&body["aggregations"]["types"]["names"]["buckets"]);
+    let fn_names = extract_agg_names(&body["aggregations"]["functions"]["names"]["buckets"]);
+
+    let mut result = format!("## project: {repo_name}\n");
+    if !modules.is_empty() {
+        result.push_str(&format!("modules: {}\n", modules.join(", ")));
+    }
+    if !type_names.is_empty() {
+        result.push_str(&format!("key types: {}\n", type_names.join(", ")));
+    }
+    if !fn_names.is_empty() {
+        result.push_str(&format!("key fns: {}\n", fn_names.join(", ")));
+    }
+
+    result
+}
+
+/// Symbol search: _analyze → symbol name matching → BM25 (the neural half is still a TODO below).
+async fn hybrid_symbol_search(
+    client: &OpenSearch,
+    index: &str,
+    repo_name: &str,
+    branch: &str,
+    user_message: &str,
+    limit: usize,
+) -> Vec<RetrievedSymbol> {
+    // Step 1: Analyze the query to extract key terms
+    let analyze_query = serde_json::json!({
+        "analyzer": "standard",
+        "text": user_message
+    });
+
+    let tokens = match client
+        .indices()
+        .analyze(opensearch::indices::IndicesAnalyzeParts::Index(index))
+        .body(analyze_query)
+        .send()
+        .await
+    {
+        Ok(r) => {
+            let body: serde_json::Value = r.json().await.unwrap_or_default();
+            body["tokens"]
+                .as_array()
+                .map(|arr| {
+                    arr.iter()
+                        .filter_map(|t| t["token"].as_str().map(String::from))
+                        .filter(|t| t.len() > 2) // skip very short tokens
+                        .collect::<Vec<String>>()
+                })
+                .unwrap_or_default()
+        }
+        Err(e) => {
+            debug!("Analyze failed (non-fatal): {e}");
+            Vec::new()
+        }
+    };
+
+    // Step 2: Build hybrid query
+    let mut should_clauses = vec![
+        serde_json::json!({ "match": { "content": user_message } }),
+        serde_json::json!({ "match": { "signature": { "query": user_message, "boost": 2.0 } } }),
+        serde_json::json!({ "match": { "docstring": { "query": user_message, "boost": 1.5 } } }),
+    ];
+
+    // Add symbol name term matching from analyzed tokens
+    if !tokens.is_empty() {
+        // Tokens are lowercased by the analyzer but symbol_name is a keyword field: match case-insensitively
+        let patterns: Vec<String> = tokens.iter().map(|t| format!(".*{t}.*")).collect();
+        should_clauses.push(serde_json::json!({
+            "regexp": { "symbol_name": { "value": patterns.join("|"), "case_insensitive": true, "boost": 3.0 } }
+        }));
+    }
+
+    let query = serde_json::json!({
+        "size": limit,
+        "_source": ["file_path", "symbol_name", "symbol_kind", "signature", "docstring", "start_line"],
+        "query": {
+            "bool": {
+                "should": should_clauses,
+                "filter": [
+                    { "term": { "repo_name": repo_name } },
+                    { "bool": { "should": [
+                        { "term": { "branch": { "value": branch, "boost": 2.0 } } },
+                        { "term": { "branch": "mainline" } },
+                        { "term": { "branch": "main" } }
+                    ]}}
+                ],
+                "minimum_should_match": 1
+            }
+        }
+    });
+
+    // TODO: Add neural search component when kNN is available on the index.
+    // The hybrid pipeline (tuwunel_hybrid_pipeline) will combine BM25 + neural.
+    // For now, use BM25-only search until embeddings are populated.
+
+    let response = match client
+        .search(opensearch::SearchParts::Index(&[index]))
+        .body(query)
+        .send()
+        .await
+    {
+        Ok(r) => r,
+        Err(e) => {
+            warn!("Hybrid symbol search failed: {e}");
+            return Vec::new();
+        }
+    };
+
+    let body: serde_json::Value = match response.json().await {
+        Ok(b) => b,
+        Err(e) => {
+            warn!("Failed to parse search response: {e}");
+            return Vec::new();
+        }
+    };
+
+    body["hits"]["hits"]
+        .as_array()
+        .map(|hits| {
+            hits.iter()
+                .filter_map(|hit| serde_json::from_value(hit["_source"].clone()).ok())
+                .collect()
+        })
+        .unwrap_or_default()
+}
+
+/// Format breadcrumbs within a byte-length budget: outline first, then symbols while they fit.
+fn format_with_budget(
+    outline: &str,
+    relevant: &[RetrievedSymbol],
+    budget: usize,
+) -> String {
+    let mut result = outline.to_string();
+
+    if relevant.is_empty() || result.len() >= budget {
+        let mut cut = budget.min(result.len()); // String::truncate panics inside a multi-byte char
+        while !result.is_char_boundary(cut) { cut -= 1; }
+        result.truncate(cut);
+        return result;
+    }
+
+    result.push_str("## relevant context\n");
+
+    for sym in relevant {
+        let entry = format_symbol(sym);
+        if result.len() + entry.len() > budget {
+            break;
+        }
+        result.push_str(&entry);
+    }
+
+    result
+}
+
+/// Format a single symbol as a breadcrumb entry.
+fn format_symbol(sym: &RetrievedSymbol) -> String {
+    let mut entry = String::new();
+    if !sym.docstring.is_empty() {
+        // Take first line of docstring
+        let first_line = sym.docstring.lines().next().unwrap_or("");
+        entry.push_str(&format!("/// {first_line}\n"));
+    }
+    entry.push_str(&format!(
+        "{} // {}:{}\n",
+        sym.signature, sym.file_path, sym.start_line
+    ));
+    entry
+}
+
+fn extract_agg_names(buckets: &serde_json::Value) -> Vec<String> {
+    buckets
+        .as_array()
+        .map(|arr| {
+            arr.iter()
+                .filter_map(|b| b["key"].as_str().map(String::from))
+                .collect()
+        })
+        .unwrap_or_default()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_format_symbol_with_docstring() {
+        let sym = RetrievedSymbol {
+            file_path: "src/orchestrator/mod.rs".into(),
+            symbol_name: "generate".into(),
+            symbol_kind: "function".into(),
+            signature: "pub async fn generate(&self, req: &GenerateRequest) -> Option".into(),
+            docstring: "Generate a response using the ConversationRegistry.\nMore details here.".into(),
+            start_line: 80,
+        };
+        let formatted = format_symbol(&sym);
+        assert!(formatted.contains("/// Generate a response"));
+        assert!(formatted.contains("src/orchestrator/mod.rs:80"));
+        // Only first line of docstring
+        assert!(!formatted.contains("More details"));
+    }
+
+    #[test]
+    fn test_format_symbol_without_docstring() {
+        let sym = RetrievedSymbol {
+            file_path: "src/main.rs".into(),
+            symbol_name: "main".into(),
+            symbol_kind: "function".into(),
+            signature: "fn main()".into(),
+            docstring: String::new(),
+            start_line: 1,
+        };
+        let formatted = format_symbol(&sym);
+        assert!(!formatted.contains("///"));
+        assert!(formatted.contains("fn main()"));
+    }
+
+    #[test]
+    fn test_format_with_budget_truncation() {
+        let outline = "## project: test\nmodules: a, b, c\n";
+        let symbols = vec![
+            RetrievedSymbol {
+                file_path: "a.rs".into(),
+                symbol_name: "foo".into(),
+                symbol_kind: "function".into(),
+                signature: "fn foo()".into(),
+                docstring: "Does foo.".into(),
+                start_line: 1,
+            },
+            RetrievedSymbol {
+                file_path: "b.rs".into(),
+                symbol_name: "bar".into(),
+                symbol_kind: "function".into(),
+                signature: "fn bar()".into(),
+                docstring: "Does bar.".into(),
+                start_line: 1,
+            },
+        ];
+
+        // Outline (34) + header (20) + foo (33) = 87 bytes fits; adding bar (33) = 120 does not
+        let result = format_with_budget(outline, &symbols, 100);
+        assert!(result.contains("foo"));
+        assert!(!result.contains("bar"));
+    }
+
+    #[test]
+    fn test_format_with_budget_empty_relevant() {
+        let outline = "## project: test\n";
+        let result = format_with_budget(outline, &[], 1000);
+        assert_eq!(result, outline);
+        assert!(!result.contains("relevant context"));
+    }
+}
diff --git a/src/code_index/indexer.rs b/src/code_index/indexer.rs
new file mode 100644
index 0000000..da760be
--- /dev/null
+++ b/src/code_index/indexer.rs
@@ -0,0 +1,133 @@
+//! Code indexer — batches symbol documents and flushes to OpenSearch.
+
+use opensearch::http::request::JsonBody;
+use opensearch::OpenSearch;
+use serde_json::json;
+use tracing::{error, info};
+
+use super::schema::SymbolDocument;
+
+/// Batch indexer for code symbols.
+pub struct CodeIndexer {
+    client: OpenSearch,
+    index: String,
+    pipeline: String,
+    buffer: Vec<SymbolDocument>,
+    batch_size: usize,
+}
+
+impl CodeIndexer {
+    pub fn new(client: OpenSearch, index: String, pipeline: String, batch_size: usize) -> Self {
+        Self {
+            client,
+            index,
+            pipeline,
+            buffer: Vec::new(),
+            batch_size,
+        }
+    }
+
+    /// Add a symbol to the buffer. Flushes when batch size is reached.
+    pub async fn add(&mut self, doc: SymbolDocument) {
+        self.buffer.push(doc);
+        if self.buffer.len() >= self.batch_size {
+            self.flush().await;
+        }
+    }
+
+    /// Flush all buffered symbols to OpenSearch. The buffer is cleared even if the request fails.
+    pub async fn flush(&mut self) {
+        if self.buffer.is_empty() {
+            return;
+        }
+
+        let mut body: Vec<JsonBody<_>> = Vec::with_capacity(self.buffer.len() * 2);
+
+        for doc in &self.buffer {
+            let doc_id = format!("{}:{}:{}:{}", doc.repo_name, doc.file_path, doc.symbol_name, doc.branch);
+            body.push(json!({ "index": { "_index": self.index, "_id": doc_id } }).into());
+            body.push(serde_json::to_value(doc).unwrap_or_default().into());
+        }
+
+        match self
+            .client
+            .bulk(opensearch::BulkParts::None)
+            .pipeline(&self.pipeline)
+            .body(body)
+            .send()
+            .await
+        {
+            Ok(response) => {
+                let count = self.buffer.len();
+                if response.status_code().is_success() {
+                    info!(count, "Flushed symbols to code index");
+                } else {
+                    let text = response.text().await.unwrap_or_default();
+                    error!(count, "Code index bulk failed: {text}");
+                }
+            }
+            Err(e) => {
+                error!("Code index flush error: {e}");
+            }
+        }
+
+        self.buffer.clear();
+    }
+
+    /// Delete all symbols for a repo + branch (before re-indexing).
+    pub async fn delete_branch(&self, repo_name: &str, branch: &str) {
+        let query = json!({
+            "query": {
+                "bool": {
+                    "must": [
+                        { "term": { "repo_name": repo_name } },
+                        { "term": { "branch": branch } }
+                    ]
+                }
+            }
+        });
+
+        match self
+            .client
+            .delete_by_query(opensearch::DeleteByQueryParts::Index(&[&self.index]))
+            .body(query)
+            .send()
+            .await
+        {
+            Ok(r) => {
+                info!(repo_name, branch, "Deleted symbols for branch re-index");
+                let _ = r;
+            }
+            Err(e) => {
+                error!(repo_name, branch, "Failed to delete branch symbols: {e}");
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_document_id_format() {
+        let doc = SymbolDocument {
+            file_path: "src/main.rs".into(),
+            repo_owner: None,
+            repo_name: "sol".into(),
+            language: "rust".into(),
+            symbol_name: "main".into(),
+            symbol_kind: "function".into(),
+            signature: "fn main()".into(),
+            docstring: String::new(),
+            start_line: 1,
+            end_line: 10,
+            content: "fn main() {}".into(),
+            branch: "mainline".into(),
+            source: "local".into(),
+            indexed_at: 0,
+        };
+        let doc_id = format!("{}:{}:{}:{}", doc.repo_name, doc.file_path, doc.symbol_name, doc.branch);
+        assert_eq!(doc_id, "sol:src/main.rs:main:mainline");
+    }
+}
diff --git a/src/code_index/mod.rs b/src/code_index/mod.rs
new file mode 100644
index 0000000..967008b
--- /dev/null
+++ b/src/code_index/mod.rs
@@ -0,0 +1,7 @@
+//! Code index — OpenSearch-backed symbol index for source code.
+//!
+//! Indexes symbols (functions, structs, enums, traits) with their signatures,
+//! docstrings, and body content. Supports branch-aware semantic search.
+
+pub mod schema;
+pub mod indexer;
diff --git a/src/code_index/schema.rs b/src/code_index/schema.rs
new file mode 100644
index 0000000..3970ad4
--- /dev/null
+++ b/src/code_index/schema.rs
@@ -0,0 +1,177 @@
+//! Code index schema — SymbolDocument and OpenSearch index mapping.
+
+use opensearch::OpenSearch;
+use serde::{Deserialize, Serialize};
+use tracing::info;
+
+/// A symbol indexed in OpenSearch for code search and breadcrumbs.
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct SymbolDocument {
+    /// File path relative to repo root.
+    pub file_path: String,
+    /// Repository owner (e.g., "studio").
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub repo_owner: Option<String>,
+    /// Repository name (e.g., "sol").
+    pub repo_name: String,
+    /// Programming language (e.g., "rust", "typescript", "python").
+    pub language: String,
+    /// Symbol name (e.g., "run_tool_loop", "Orchestrator").
+    pub symbol_name: String,
+    /// Symbol kind (e.g., "function", "struct", "enum", "trait", "impl").
+    pub symbol_kind: String,
+    /// Full signature (e.g., "pub async fn generate(&self, req: &GenerateRequest) -> Option").
+    pub signature: String,
+    /// Doc comment / docstring.
+    #[serde(default, skip_serializing_if = "String::is_empty")]
+    pub docstring: String,
+    /// Start line in the file (1-based).
+    pub start_line: u32,
+    /// End line in the file (1-based).
+    pub end_line: u32,
+    /// Full body content of the symbol (for embedding).
+    pub content: String,
+    /// Git branch this symbol was indexed from.
+    pub branch: String,
+    /// Source of the index: "gitea", "local", or "sidecar" (future).
+    pub source: String,
+    /// When this was indexed (epoch millis).
+    pub indexed_at: i64,
+}
+
+const INDEX_MAPPING: &str = r#"{
+  "settings": {
+    "number_of_shards": 1,
+    "number_of_replicas": 0,
+    "index.knn": true
+  },
+  "mappings": {
+    "properties": {
+      "file_path": { "type": "keyword" },
+      "repo_owner": { "type": "keyword" },
+      "repo_name": { "type": "keyword" },
+      "language": { "type": "keyword" },
+      "symbol_name": { "type": "keyword" },
+      "symbol_kind": { "type": "keyword" },
+      "signature": { "type": "text" },
+      "docstring": { "type": "text" },
+      "start_line": { "type": "integer" },
+      "end_line": { "type": "integer" },
+      "content": { "type": "text", "analyzer": "standard" },
+      "branch": { "type": "keyword" },
+      "source": { "type": "keyword" },
+      "indexed_at": { "type": "date", "format": "epoch_millis" },
+      "embedding": {
+        "type": "knn_vector",
+        "dimension": 768,
+        "method": {
+          "name": "hnsw",
+          "space_type": "cosinesimil",
+          "engine": "lucene"
+        }
+      }
+    }
+  }
+}"#;
+
+pub fn index_mapping_json() -> &'static str {
+    INDEX_MAPPING
+}
+
+pub async fn create_index_if_not_exists(client: &OpenSearch, index: &str) -> anyhow::Result<()> {
+    let exists = client
+        .indices()
+        .exists(opensearch::indices::IndicesExistsParts::Index(&[index]))
+        .send()
+        .await?;
+
+    if exists.status_code().is_success() {
+        info!(index, "Code index already exists");
+        return Ok(());
+    }
+
+    let mapping: serde_json::Value = serde_json::from_str(INDEX_MAPPING)?;
+    let response = client
+        .indices()
+        .create(opensearch::indices::IndicesCreateParts::Index(index))
+        .body(mapping)
+        .send()
+        .await?;
+
+    if !response.status_code().is_success() {
+        let body = response.text().await?;
+        anyhow::bail!("Failed to create code index {index}: {body}");
+    }
+
+    info!(index, "Created code index");
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_index_mapping_is_valid_json() {
+        let mapping: serde_json::Value = serde_json::from_str(index_mapping_json()).unwrap();
+        assert!(mapping["mappings"]["properties"]["symbol_name"]["type"]
+            .as_str()
+            .unwrap()
+            == "keyword");
+        assert!(mapping["mappings"]["properties"]["embedding"]["type"]
+            .as_str()
+            .unwrap()
+            == "knn_vector");
+        assert!(mapping["mappings"]["properties"]["branch"]["type"]
+            .as_str()
+            .unwrap()
+            == "keyword");
+    }
+
+    #[test]
+    fn test_symbol_document_serialize() {
+        let doc = SymbolDocument {
+            file_path: "src/orchestrator/mod.rs".into(),
+            repo_owner: Some("studio".into()),
+            repo_name: "sol".into(),
+            language: "rust".into(),
+            symbol_name: "generate".into(),
+            symbol_kind: "function".into(),
+            signature: "pub async fn generate(&self, req: &GenerateRequest) -> Option".into(),
+            docstring: "Generate a response using the ConversationRegistry.".into(),
+            start_line: 80,
+            end_line: 120,
+            content: "pub async fn generate(...) { ... }".into(),
+            branch: "mainline".into(),
+            source: "gitea".into(),
+            indexed_at: 1774310400000,
+        };
+        let json = serde_json::to_value(&doc).unwrap();
+        assert_eq!(json["symbol_name"], "generate");
+        assert_eq!(json["branch"], "mainline");
+        assert_eq!(json["language"], "rust");
+    }
+
+    #[test]
+    fn test_symbol_document_skip_empty_docstring() {
+        let doc = SymbolDocument {
+            file_path: "src/main.rs".into(),
+            repo_owner: None,
+            repo_name: "sol".into(),
+            language: "rust".into(),
+            symbol_name: "main".into(),
+            symbol_kind: "function".into(),
+            signature: "fn main()".into(),
+            docstring: String::new(),
+            start_line: 1,
+            end_line: 10,
+            content: "fn main() { ... }".into(),
+            branch: "mainline".into(),
+            source: "local".into(),
+            indexed_at: 0,
+        };
+        let json_str = serde_json::to_string(&doc).unwrap();
+        assert!(!json_str.contains("docstring"));
+        assert!(!json_str.contains("repo_owner"));
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 11ed79b..77466a3 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2,6 +2,8 @@ mod agent_ux;
 mod agents;
 mod archive;
 mod brain;
+mod breadcrumbs;
+mod code_index;
 mod config;
 mod context;
 mod conversations;