//! Adaptive breadcrumbs — lightweight code context injected into every prompt. //! //! Uses OpenSearch's hybrid search (BM25 + neural) to find relevant symbols //! from the code index based on the user's message. Always injects a default //! project outline, then expands with relevant signatures + docstrings. use opensearch::OpenSearch; use serde::Deserialize; use tracing::{debug, warn}; /// A symbol retrieved from the code index. #[derive(Debug, Clone, Deserialize)] pub struct RetrievedSymbol { pub file_path: String, pub symbol_name: String, pub symbol_kind: String, pub signature: String, #[serde(default)] pub docstring: String, pub start_line: u32, } /// Result of building breadcrumbs for a prompt. #[derive(Debug)] pub struct BreadcrumbResult { /// The default project outline (~200 tokens). pub outline: String, /// Relevant symbols from adaptive retrieval. pub relevant: Vec, /// Ready-to-inject formatted string. pub formatted: String, } /// Build adaptive breadcrumbs for a coding session. /// /// 1. Always: project outline (module names, key types/fns) from aggregation /// 2. Adaptive: if user message is substantive, hybrid search for relevant symbols /// 3. Format within token budget pub async fn build_breadcrumbs( client: &OpenSearch, index: &str, repo_name: &str, branch: &str, user_message: &str, token_budget: usize, ) -> BreadcrumbResult { let outline = load_project_outline(client, index, repo_name, branch).await; let relevant = if user_message.split_whitespace().count() >= 3 { hybrid_symbol_search(client, index, repo_name, branch, user_message, 10).await } else { Vec::new() }; let formatted = format_with_budget(&outline, &relevant, token_budget); BreadcrumbResult { outline, relevant, formatted } } /// Load the project outline: distinct modules, key type names, key function names. async fn load_project_outline( client: &OpenSearch, index: &str, repo_name: &str, branch: &str, ) -> String { let query = serde_json::json!({ "size": 0, "query": { "bool": { "filter": [ { "term": { "repo_name": repo_name } }, { "bool": { "should": [ { "term": { "branch": branch } }, { "term": { "branch": "mainline" } }, { "term": { "branch": "main" } } ]}} ] } }, "aggs": { "modules": { "terms": { "field": "file_path", "size": 50 } }, "types": { "filter": { "terms": { "symbol_kind": ["struct", "enum", "trait", "class", "interface", "type"] } }, "aggs": { "names": { "terms": { "field": "symbol_name", "size": 20 } } } }, "functions": { "filter": { "terms": { "symbol_kind": ["function", "method", "async_function"] } }, "aggs": { "names": { "terms": { "field": "symbol_name", "size": 20 } } } } } }); let response = match client .search(opensearch::SearchParts::Index(&[index])) .body(query) .send() .await { Ok(r) => r, Err(e) => { warn!("Failed to load project outline: {e}"); return String::new(); } }; let body: serde_json::Value = match response.json().await { Ok(b) => b, Err(e) => { warn!("Failed to parse outline response: {e}"); return String::new(); } }; // Extract module paths (deduplicate to directory level) let mut modules: Vec = Vec::new(); if let Some(buckets) = body["aggregations"]["modules"]["buckets"].as_array() { for b in buckets { if let Some(path) = b["key"].as_str() { // Extract directory: "src/orchestrator/mod.rs" → "orchestrator" let parts: Vec<&str> = path.split('/').collect(); if parts.len() >= 2 { let module = parts[parts.len() - 2]; if !modules.contains(&module.to_string()) && module != "src" { modules.push(module.to_string()); } } } } } let type_names = extract_agg_names(&body["aggregations"]["types"]["names"]["buckets"]); let fn_names = extract_agg_names(&body["aggregations"]["functions"]["names"]["buckets"]); let mut result = format!("## project: {repo_name}\n"); if !modules.is_empty() { result.push_str(&format!("modules: {}\n", modules.join(", "))); } if !type_names.is_empty() { result.push_str(&format!("key types: {}\n", type_names.join(", "))); } if !fn_names.is_empty() { result.push_str(&format!("key fns: {}\n", fn_names.join(", "))); } result } /// Hybrid search: _analyze → symbol name matching → BM25 + neural. async fn hybrid_symbol_search( client: &OpenSearch, index: &str, repo_name: &str, branch: &str, user_message: &str, limit: usize, ) -> Vec { // Step 1: Analyze the query to extract key terms let analyze_query = serde_json::json!({ "analyzer": "standard", "text": user_message }); let tokens = match client .indices() .analyze(opensearch::indices::IndicesAnalyzeParts::Index(index)) .body(analyze_query) .send() .await { Ok(r) => { let body: serde_json::Value = r.json().await.unwrap_or_default(); body["tokens"] .as_array() .map(|arr| { arr.iter() .filter_map(|t| t["token"].as_str().map(String::from)) .filter(|t| t.len() > 2) // skip very short tokens .collect::>() }) .unwrap_or_default() } Err(e) => { debug!("Analyze failed (non-fatal): {e}"); Vec::new() } }; // Step 2: Build hybrid query let mut should_clauses = vec![ serde_json::json!({ "match": { "content": user_message } }), serde_json::json!({ "match": { "signature": { "query": user_message, "boost": 2.0 } } }), serde_json::json!({ "match": { "docstring": { "query": user_message, "boost": 1.5 } } }), ]; // Add symbol name term matching from analyzed tokens if !tokens.is_empty() { // Build wildcard patterns from tokens for symbol name matching let patterns: Vec = tokens.iter().map(|t| format!(".*{t}.*")).collect(); should_clauses.push(serde_json::json!({ "regexp": { "symbol_name": { "value": patterns.join("|"), "boost": 3.0 } } })); } let query = serde_json::json!({ "size": limit, "_source": ["file_path", "symbol_name", "symbol_kind", "signature", "docstring", "start_line"], "query": { "bool": { "should": should_clauses, "filter": [ { "term": { "repo_name": repo_name } }, { "bool": { "should": [ { "term": { "branch": { "value": branch, "boost": 2.0 } } }, { "term": { "branch": "mainline" } }, { "term": { "branch": "main" } } ]}} ], "minimum_should_match": 1 } } }); // TODO: Add neural search component when kNN is available on the index. // The hybrid pipeline (tuwunel_hybrid_pipeline) will combine BM25 + neural. // For now, use BM25-only search until embeddings are populated. let response = match client .search(opensearch::SearchParts::Index(&[index])) .body(query) .send() .await { Ok(r) => r, Err(e) => { warn!("Hybrid symbol search failed: {e}"); return Vec::new(); } }; let body: serde_json::Value = match response.json().await { Ok(b) => b, Err(e) => { warn!("Failed to parse search response: {e}"); return Vec::new(); } }; body["hits"]["hits"] .as_array() .map(|hits| { hits.iter() .filter_map(|hit| serde_json::from_value(hit["_source"].clone()).ok()) .collect() }) .unwrap_or_default() } /// Format breadcrumbs within a character budget. fn format_with_budget( outline: &str, relevant: &[RetrievedSymbol], budget: usize, ) -> String { let mut result = outline.to_string(); if relevant.is_empty() || result.len() >= budget { if result.len() > budget { result.truncate(budget); } return result; } result.push_str("## relevant context\n"); for sym in relevant { let entry = format_symbol(sym); if result.len() + entry.len() > budget { break; } result.push_str(&entry); } result } /// Format a single symbol as a breadcrumb entry. fn format_symbol(sym: &RetrievedSymbol) -> String { let mut entry = String::new(); if !sym.docstring.is_empty() { // Take first line of docstring let first_line = sym.docstring.lines().next().unwrap_or(""); entry.push_str(&format!("/// {first_line}\n")); } entry.push_str(&format!( "{} // {}:{}\n", sym.signature, sym.file_path, sym.start_line )); entry } fn extract_agg_names(buckets: &serde_json::Value) -> Vec { buckets .as_array() .map(|arr| { arr.iter() .filter_map(|b| b["key"].as_str().map(String::from)) .collect() }) .unwrap_or_default() } #[cfg(test)] mod tests { use super::*; #[test] fn test_format_symbol_with_docstring() { let sym = RetrievedSymbol { file_path: "src/orchestrator/mod.rs".into(), symbol_name: "generate".into(), symbol_kind: "function".into(), signature: "pub async fn generate(&self, req: &GenerateRequest) -> Option".into(), docstring: "Generate a response using the ConversationRegistry.\nMore details here.".into(), start_line: 80, }; let formatted = format_symbol(&sym); assert!(formatted.contains("/// Generate a response")); assert!(formatted.contains("src/orchestrator/mod.rs:80")); // Only first line of docstring assert!(!formatted.contains("More details")); } #[test] fn test_format_symbol_without_docstring() { let sym = RetrievedSymbol { file_path: "src/main.rs".into(), symbol_name: "main".into(), symbol_kind: "function".into(), signature: "fn main()".into(), docstring: String::new(), start_line: 1, }; let formatted = format_symbol(&sym); assert!(!formatted.contains("///")); assert!(formatted.contains("fn main()")); } #[test] fn test_format_with_budget_truncation() { let outline = "## project: test\nmodules: a, b, c\n"; let symbols = vec![ RetrievedSymbol { file_path: "a.rs".into(), symbol_name: "foo".into(), symbol_kind: "function".into(), signature: "fn foo()".into(), docstring: "Does foo.".into(), start_line: 1, }, RetrievedSymbol { file_path: "b.rs".into(), symbol_name: "bar".into(), symbol_kind: "function".into(), signature: "fn bar()".into(), docstring: "Does bar.".into(), start_line: 1, }, ]; // Budget that fits outline + one symbol but not both let result = format_with_budget(outline, &symbols, 120); assert!(result.contains("foo")); // May or may not contain bar depending on exact lengths } #[test] fn test_format_with_budget_empty_relevant() { let outline = "## project: test\n"; let result = format_with_budget(outline, &[], 1000); assert_eq!(result, outline); assert!(!result.contains("relevant context")); } }