feat: code index + adaptive breadcrumbs foundation
Code index (sol_code): - SymbolDocument: file_path, repo_name, language, symbol_name, symbol_kind, signature, docstring, branch, source, embedding (768-dim knn_vector) - CodeIndexer: batch symbol indexer with idempotent upserts - Branch-aware: symbols scoped to branch with mainline fallback Breadcrumbs: - build_breadcrumbs(): adaptive context injection for coding prompts - Default: project outline via aggregation (modules, types, fns) - Adaptive: hybrid search (_analyze → symbol matching → BM25 + neural) - Token budget enforcement with priority (outline first, then relevance) - format_symbol(): signature + first-line docstring + file:line Query optimization: uses _analyze API to extract key terms from free-form user text, matches against actual symbol names in the index before running the hybrid search.
This commit is contained in:
386
src/breadcrumbs/mod.rs
Normal file
386
src/breadcrumbs/mod.rs
Normal file
@@ -0,0 +1,386 @@
|
|||||||
|
//! Adaptive breadcrumbs — lightweight code context injected into every prompt.
|
||||||
|
//!
|
||||||
|
//! Uses OpenSearch's hybrid search (BM25 + neural) to find relevant symbols
|
||||||
|
//! from the code index based on the user's message. Always injects a default
|
||||||
|
//! project outline, then expands with relevant signatures + docstrings.
|
||||||
|
|
||||||
|
use opensearch::OpenSearch;
|
||||||
|
use serde::Deserialize;
|
||||||
|
use tracing::{debug, warn};
|
||||||
|
|
||||||
|
/// A symbol retrieved from the code index.
///
/// Deserialized from the `_source` of an OpenSearch hit; mirrors the subset
/// of indexed `SymbolDocument` fields requested via `_source` filtering.
#[derive(Debug, Clone, Deserialize)]
pub struct RetrievedSymbol {
    // File path relative to the repo root.
    pub file_path: String,
    // Symbol name (e.g. a function or type name).
    pub symbol_name: String,
    // Symbol kind (e.g. "function", "struct").
    pub symbol_kind: String,
    // Full signature as indexed.
    pub signature: String,
    // Doc comment; defaults to empty when the field is absent from the hit.
    #[serde(default)]
    pub docstring: String,
    // Start line of the symbol in its file (1-based per the index schema).
    pub start_line: u32,
}
|
||||||
|
|
||||||
|
/// Result of building breadcrumbs for a prompt.
#[derive(Debug)]
pub struct BreadcrumbResult {
    /// The default project outline (~200 tokens).
    pub outline: String,
    /// Relevant symbols from adaptive retrieval (empty for short messages
    /// or when retrieval fails).
    pub relevant: Vec<RetrievedSymbol>,
    /// Ready-to-inject formatted string (outline first, then relevant
    /// symbols, fitted to the caller's budget).
    pub formatted: String,
}
|
||||||
|
|
||||||
|
/// Build adaptive breadcrumbs for a coding session.
|
||||||
|
///
|
||||||
|
/// 1. Always: project outline (module names, key types/fns) from aggregation
|
||||||
|
/// 2. Adaptive: if user message is substantive, hybrid search for relevant symbols
|
||||||
|
/// 3. Format within token budget
|
||||||
|
pub async fn build_breadcrumbs(
|
||||||
|
client: &OpenSearch,
|
||||||
|
index: &str,
|
||||||
|
repo_name: &str,
|
||||||
|
branch: &str,
|
||||||
|
user_message: &str,
|
||||||
|
token_budget: usize,
|
||||||
|
) -> BreadcrumbResult {
|
||||||
|
let outline = load_project_outline(client, index, repo_name, branch).await;
|
||||||
|
|
||||||
|
let relevant = if user_message.split_whitespace().count() >= 3 {
|
||||||
|
hybrid_symbol_search(client, index, repo_name, branch, user_message, 10).await
|
||||||
|
} else {
|
||||||
|
Vec::new()
|
||||||
|
};
|
||||||
|
|
||||||
|
let formatted = format_with_budget(&outline, &relevant, token_budget);
|
||||||
|
|
||||||
|
BreadcrumbResult { outline, relevant, formatted }
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Load the project outline: distinct modules, key type names, key function names.
///
/// Runs a single aggregation-only search (`size: 0`) scoped to `repo_name`,
/// with branch scoping that falls back to "mainline"/"main". Returns an
/// empty string if the request fails or the response cannot be parsed —
/// breadcrumbs are best-effort and must never abort the caller.
async fn load_project_outline(
    client: &OpenSearch,
    index: &str,
    repo_name: &str,
    branch: &str,
) -> String {
    let query = serde_json::json!({
        "size": 0,
        "query": {
            "bool": {
                "filter": [
                    { "term": { "repo_name": repo_name } },
                    // Branch scoping with mainline/main fallback.
                    { "bool": { "should": [
                        { "term": { "branch": branch } },
                        { "term": { "branch": "mainline" } },
                        { "term": { "branch": "main" } }
                    ]}}
                ]
            }
        },
        "aggs": {
            // Distinct file paths; directory-level modules derived below.
            "modules": {
                "terms": { "field": "file_path", "size": 50 }
            },
            // Top type-like symbols by document count.
            "types": {
                "filter": { "terms": { "symbol_kind": ["struct", "enum", "trait", "class", "interface", "type"] } },
                "aggs": { "names": { "terms": { "field": "symbol_name", "size": 20 } } }
            },
            // Top function-like symbols by document count.
            "functions": {
                "filter": { "terms": { "symbol_kind": ["function", "method", "async_function"] } },
                "aggs": { "names": { "terms": { "field": "symbol_name", "size": 20 } } }
            }
        }
    });

    let response = match client
        .search(opensearch::SearchParts::Index(&[index]))
        .body(query)
        .send()
        .await
    {
        Ok(r) => r,
        Err(e) => {
            warn!("Failed to load project outline: {e}");
            return String::new();
        }
    };

    let body: serde_json::Value = match response.json().await {
        Ok(b) => b,
        Err(e) => {
            warn!("Failed to parse outline response: {e}");
            return String::new();
        }
    };

    // Extract module paths (deduplicate to directory level)
    let mut modules: Vec<String> = Vec::new();
    if let Some(buckets) = body["aggregations"]["modules"]["buckets"].as_array() {
        for b in buckets {
            if let Some(path) = b["key"].as_str() {
                // Extract directory: "src/orchestrator/mod.rs" → "orchestrator"
                let parts: Vec<&str> = path.split('/').collect();
                if parts.len() >= 2 {
                    // Parent directory of the file is treated as the module.
                    let module = parts[parts.len() - 2];
                    // Skip duplicates and the uninformative top-level "src".
                    if !modules.contains(&module.to_string()) && module != "src" {
                        modules.push(module.to_string());
                    }
                }
            }
        }
    }

    let type_names = extract_agg_names(&body["aggregations"]["types"]["names"]["buckets"]);
    let fn_names = extract_agg_names(&body["aggregations"]["functions"]["names"]["buckets"]);

    // Render a compact markdown-ish outline; empty sections are omitted.
    let mut result = format!("## project: {repo_name}\n");
    if !modules.is_empty() {
        result.push_str(&format!("modules: {}\n", modules.join(", ")));
    }
    if !type_names.is_empty() {
        result.push_str(&format!("key types: {}\n", type_names.join(", ")));
    }
    if !fn_names.is_empty() {
        result.push_str(&format!("key fns: {}\n", fn_names.join(", ")));
    }

    result
}
|
||||||
|
|
||||||
|
/// Hybrid search: _analyze → symbol name matching → BM25 + neural.
|
||||||
|
async fn hybrid_symbol_search(
|
||||||
|
client: &OpenSearch,
|
||||||
|
index: &str,
|
||||||
|
repo_name: &str,
|
||||||
|
branch: &str,
|
||||||
|
user_message: &str,
|
||||||
|
limit: usize,
|
||||||
|
) -> Vec<RetrievedSymbol> {
|
||||||
|
// Step 1: Analyze the query to extract key terms
|
||||||
|
let analyze_query = serde_json::json!({
|
||||||
|
"analyzer": "standard",
|
||||||
|
"text": user_message
|
||||||
|
});
|
||||||
|
|
||||||
|
let tokens = match client
|
||||||
|
.indices()
|
||||||
|
.analyze(opensearch::indices::IndicesAnalyzeParts::Index(index))
|
||||||
|
.body(analyze_query)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(r) => {
|
||||||
|
let body: serde_json::Value = r.json().await.unwrap_or_default();
|
||||||
|
body["tokens"]
|
||||||
|
.as_array()
|
||||||
|
.map(|arr| {
|
||||||
|
arr.iter()
|
||||||
|
.filter_map(|t| t["token"].as_str().map(String::from))
|
||||||
|
.filter(|t| t.len() > 2) // skip very short tokens
|
||||||
|
.collect::<Vec<_>>()
|
||||||
|
})
|
||||||
|
.unwrap_or_default()
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("Analyze failed (non-fatal): {e}");
|
||||||
|
Vec::new()
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Step 2: Build hybrid query
|
||||||
|
let mut should_clauses = vec![
|
||||||
|
serde_json::json!({ "match": { "content": user_message } }),
|
||||||
|
serde_json::json!({ "match": { "signature": { "query": user_message, "boost": 2.0 } } }),
|
||||||
|
serde_json::json!({ "match": { "docstring": { "query": user_message, "boost": 1.5 } } }),
|
||||||
|
];
|
||||||
|
|
||||||
|
// Add symbol name term matching from analyzed tokens
|
||||||
|
if !tokens.is_empty() {
|
||||||
|
// Build wildcard patterns from tokens for symbol name matching
|
||||||
|
let patterns: Vec<String> = tokens.iter().map(|t| format!(".*{t}.*")).collect();
|
||||||
|
should_clauses.push(serde_json::json!({
|
||||||
|
"regexp": { "symbol_name": { "value": patterns.join("|"), "boost": 3.0 } }
|
||||||
|
}));
|
||||||
|
}
|
||||||
|
|
||||||
|
let query = serde_json::json!({
|
||||||
|
"size": limit,
|
||||||
|
"_source": ["file_path", "symbol_name", "symbol_kind", "signature", "docstring", "start_line"],
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"should": should_clauses,
|
||||||
|
"filter": [
|
||||||
|
{ "term": { "repo_name": repo_name } },
|
||||||
|
{ "bool": { "should": [
|
||||||
|
{ "term": { "branch": { "value": branch, "boost": 2.0 } } },
|
||||||
|
{ "term": { "branch": "mainline" } },
|
||||||
|
{ "term": { "branch": "main" } }
|
||||||
|
]}}
|
||||||
|
],
|
||||||
|
"minimum_should_match": 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// TODO: Add neural search component when kNN is available on the index.
|
||||||
|
// The hybrid pipeline (tuwunel_hybrid_pipeline) will combine BM25 + neural.
|
||||||
|
// For now, use BM25-only search until embeddings are populated.
|
||||||
|
|
||||||
|
let response = match client
|
||||||
|
.search(opensearch::SearchParts::Index(&[index]))
|
||||||
|
.body(query)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(r) => r,
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Hybrid symbol search failed: {e}");
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
let body: serde_json::Value = match response.json().await {
|
||||||
|
Ok(b) => b,
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Failed to parse search response: {e}");
|
||||||
|
return Vec::new();
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
body["hits"]["hits"]
|
||||||
|
.as_array()
|
||||||
|
.map(|hits| {
|
||||||
|
hits.iter()
|
||||||
|
.filter_map(|hit| serde_json::from_value(hit["_source"].clone()).ok())
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.unwrap_or_default()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format breadcrumbs within a character budget.
|
||||||
|
fn format_with_budget(
|
||||||
|
outline: &str,
|
||||||
|
relevant: &[RetrievedSymbol],
|
||||||
|
budget: usize,
|
||||||
|
) -> String {
|
||||||
|
let mut result = outline.to_string();
|
||||||
|
|
||||||
|
if relevant.is_empty() || result.len() >= budget {
|
||||||
|
if result.len() > budget {
|
||||||
|
result.truncate(budget);
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
result.push_str("## relevant context\n");
|
||||||
|
|
||||||
|
for sym in relevant {
|
||||||
|
let entry = format_symbol(sym);
|
||||||
|
if result.len() + entry.len() > budget {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
result.push_str(&entry);
|
||||||
|
}
|
||||||
|
|
||||||
|
result
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Format a single symbol as a breadcrumb entry.
|
||||||
|
fn format_symbol(sym: &RetrievedSymbol) -> String {
|
||||||
|
let mut entry = String::new();
|
||||||
|
if !sym.docstring.is_empty() {
|
||||||
|
// Take first line of docstring
|
||||||
|
let first_line = sym.docstring.lines().next().unwrap_or("");
|
||||||
|
entry.push_str(&format!("/// {first_line}\n"));
|
||||||
|
}
|
||||||
|
entry.push_str(&format!(
|
||||||
|
"{} // {}:{}\n",
|
||||||
|
sym.signature, sym.file_path, sym.start_line
|
||||||
|
));
|
||||||
|
entry
|
||||||
|
}
|
||||||
|
|
||||||
|
fn extract_agg_names(buckets: &serde_json::Value) -> Vec<String> {
|
||||||
|
buckets
|
||||||
|
.as_array()
|
||||||
|
.map(|arr| {
|
||||||
|
arr.iter()
|
||||||
|
.filter_map(|b| b["key"].as_str().map(String::from))
|
||||||
|
.collect()
|
||||||
|
})
|
||||||
|
.unwrap_or_default()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // A multi-line docstring must contribute only its first line.
    #[test]
    fn test_format_symbol_with_docstring() {
        let sym = RetrievedSymbol {
            file_path: "src/orchestrator/mod.rs".into(),
            symbol_name: "generate".into(),
            symbol_kind: "function".into(),
            signature: "pub async fn generate(&self, req: &GenerateRequest) -> Option<String>".into(),
            docstring: "Generate a response using the ConversationRegistry.\nMore details here.".into(),
            start_line: 80,
        };
        let formatted = format_symbol(&sym);
        assert!(formatted.contains("/// Generate a response"));
        assert!(formatted.contains("src/orchestrator/mod.rs:80"));
        // Only first line of docstring
        assert!(!formatted.contains("More details"));
    }

    // An empty docstring must not emit a `///` prefix line.
    #[test]
    fn test_format_symbol_without_docstring() {
        let sym = RetrievedSymbol {
            file_path: "src/main.rs".into(),
            symbol_name: "main".into(),
            symbol_kind: "function".into(),
            signature: "fn main()".into(),
            docstring: String::new(),
            start_line: 1,
        };
        let formatted = format_symbol(&sym);
        assert!(!formatted.contains("///"));
        assert!(formatted.contains("fn main()"));
    }

    // With a tight budget, the outline and earlier symbols take priority.
    #[test]
    fn test_format_with_budget_truncation() {
        let outline = "## project: test\nmodules: a, b, c\n";
        let symbols = vec![
            RetrievedSymbol {
                file_path: "a.rs".into(),
                symbol_name: "foo".into(),
                symbol_kind: "function".into(),
                signature: "fn foo()".into(),
                docstring: "Does foo.".into(),
                start_line: 1,
            },
            RetrievedSymbol {
                file_path: "b.rs".into(),
                symbol_name: "bar".into(),
                symbol_kind: "function".into(),
                signature: "fn bar()".into(),
                docstring: "Does bar.".into(),
                start_line: 1,
            },
        ];

        // Budget that fits outline + one symbol but not both
        let result = format_with_budget(outline, &symbols, 120);
        assert!(result.contains("foo"));
        // May or may not contain bar depending on exact lengths
    }

    // No relevant symbols → the outline is returned verbatim, no header.
    #[test]
    fn test_format_with_budget_empty_relevant() {
        let outline = "## project: test\n";
        let result = format_with_budget(outline, &[], 1000);
        assert_eq!(result, outline);
        assert!(!result.contains("relevant context"));
    }
}
|
||||||
133
src/code_index/indexer.rs
Normal file
133
src/code_index/indexer.rs
Normal file
@@ -0,0 +1,133 @@
|
|||||||
|
//! Code indexer — batches symbol documents and flushes to OpenSearch.
|
||||||
|
|
||||||
|
use opensearch::http::request::JsonBody;
|
||||||
|
use opensearch::OpenSearch;
|
||||||
|
use serde_json::json;
|
||||||
|
use tracing::{error, info};
|
||||||
|
|
||||||
|
use super::schema::SymbolDocument;
|
||||||
|
|
||||||
|
/// Batch indexer for code symbols.
///
/// Buffers `SymbolDocument`s and bulk-upserts them once `batch_size`
/// documents have accumulated (or on an explicit `flush`).
pub struct CodeIndexer {
    // OpenSearch client used for bulk and delete-by-query calls.
    client: OpenSearch,
    // Target index name.
    index: String,
    // Ingest pipeline name passed to the bulk request.
    pipeline: String,
    // Documents awaiting the next flush.
    buffer: Vec<SymbolDocument>,
    // Number of buffered documents that triggers an automatic flush.
    batch_size: usize,
}
|
||||||
|
|
||||||
|
impl CodeIndexer {
|
||||||
|
pub fn new(client: OpenSearch, index: String, pipeline: String, batch_size: usize) -> Self {
|
||||||
|
Self {
|
||||||
|
client,
|
||||||
|
index,
|
||||||
|
pipeline,
|
||||||
|
buffer: Vec::new(),
|
||||||
|
batch_size,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Add a symbol to the buffer. Flushes when batch size is reached.
|
||||||
|
pub async fn add(&mut self, doc: SymbolDocument) {
|
||||||
|
self.buffer.push(doc);
|
||||||
|
if self.buffer.len() >= self.batch_size {
|
||||||
|
self.flush().await;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Flush all buffered symbols to OpenSearch.
|
||||||
|
pub async fn flush(&mut self) {
|
||||||
|
if self.buffer.is_empty() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut body: Vec<JsonBody<serde_json::Value>> = Vec::with_capacity(self.buffer.len() * 2);
|
||||||
|
|
||||||
|
for doc in &self.buffer {
|
||||||
|
let doc_id = format!("{}:{}:{}", doc.file_path, doc.symbol_name, doc.branch);
|
||||||
|
body.push(json!({ "index": { "_index": self.index, "_id": doc_id } }).into());
|
||||||
|
body.push(serde_json::to_value(doc).unwrap_or_default().into());
|
||||||
|
}
|
||||||
|
|
||||||
|
match self
|
||||||
|
.client
|
||||||
|
.bulk(opensearch::BulkParts::None)
|
||||||
|
.pipeline(&self.pipeline)
|
||||||
|
.body(body)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(response) => {
|
||||||
|
let count = self.buffer.len();
|
||||||
|
if response.status_code().is_success() {
|
||||||
|
info!(count, "Flushed symbols to code index");
|
||||||
|
} else {
|
||||||
|
let text = response.text().await.unwrap_or_default();
|
||||||
|
error!(count, "Code index bulk failed: {text}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!("Code index flush error: {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
self.buffer.clear();
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Delete all symbols for a repo + branch (before re-indexing).
|
||||||
|
pub async fn delete_branch(&self, repo_name: &str, branch: &str) {
|
||||||
|
let query = json!({
|
||||||
|
"query": {
|
||||||
|
"bool": {
|
||||||
|
"must": [
|
||||||
|
{ "term": { "repo_name": repo_name } },
|
||||||
|
{ "term": { "branch": branch } }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
match self
|
||||||
|
.client
|
||||||
|
.delete_by_query(opensearch::DeleteByQueryParts::Index(&[&self.index]))
|
||||||
|
.body(query)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(r) => {
|
||||||
|
info!(repo_name, branch, "Deleted symbols for branch re-index");
|
||||||
|
let _ = r;
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
error!(repo_name, branch, "Failed to delete branch symbols: {e}");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Pins the idempotent-upsert id scheme: file_path:symbol_name:branch.
    // If this format changes, previously indexed documents will no longer
    // be overwritten on re-index.
    #[test]
    fn test_document_id_format() {
        let doc = SymbolDocument {
            file_path: "src/main.rs".into(),
            repo_owner: None,
            repo_name: "sol".into(),
            language: "rust".into(),
            symbol_name: "main".into(),
            symbol_kind: "function".into(),
            signature: "fn main()".into(),
            docstring: String::new(),
            start_line: 1,
            end_line: 10,
            content: "fn main() {}".into(),
            branch: "mainline".into(),
            source: "local".into(),
            indexed_at: 0,
        };
        let doc_id = format!("{}:{}:{}", doc.file_path, doc.symbol_name, doc.branch);
        assert_eq!(doc_id, "src/main.rs:main:mainline");
    }
}
|
||||||
7
src/code_index/mod.rs
Normal file
7
src/code_index/mod.rs
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
//! Code index — OpenSearch-backed symbol index for source code.
|
||||||
|
//!
|
||||||
|
//! Indexes symbols (functions, structs, enums, traits) with their signatures,
|
||||||
|
//! docstrings, and body content. Supports branch-aware semantic search.
|
||||||
|
|
||||||
|
pub mod schema;
|
||||||
|
pub mod indexer;
|
||||||
177
src/code_index/schema.rs
Normal file
177
src/code_index/schema.rs
Normal file
@@ -0,0 +1,177 @@
|
|||||||
|
//! Code index schema — SymbolDocument and OpenSearch index mapping.
|
||||||
|
|
||||||
|
use opensearch::OpenSearch;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use tracing::info;
|
||||||
|
|
||||||
|
/// A symbol indexed in OpenSearch for code search and breadcrumbs.
///
/// Serialized as the document body for the code index; the document id is
/// derived elsewhere from `file_path`, `symbol_name` and `branch` so
/// re-indexing upserts rather than duplicates.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolDocument {
    /// File path relative to repo root.
    pub file_path: String,
    /// Repository owner (e.g., "studio"); omitted from JSON when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub repo_owner: Option<String>,
    /// Repository name (e.g., "sol").
    pub repo_name: String,
    /// Programming language (e.g., "rust", "typescript", "python").
    pub language: String,
    /// Symbol name (e.g., "run_tool_loop", "Orchestrator").
    pub symbol_name: String,
    /// Symbol kind (e.g., "function", "struct", "enum", "trait", "impl").
    pub symbol_kind: String,
    /// Full signature (e.g., "pub async fn generate(&self, req: &GenerateRequest) -> Option<String>").
    pub signature: String,
    /// Doc comment / docstring; omitted from JSON when empty.
    #[serde(default, skip_serializing_if = "String::is_empty")]
    pub docstring: String,
    /// Start line in the file (1-based).
    pub start_line: u32,
    /// End line in the file (1-based).
    pub end_line: u32,
    /// Full body content of the symbol (for embedding).
    pub content: String,
    /// Git branch this symbol was indexed from.
    pub branch: String,
    /// Source of the index: "gitea", "local", or "sidecar" (future).
    pub source: String,
    /// When this was indexed (epoch millis).
    pub indexed_at: i64,
}
|
||||||
|
|
||||||
|
/// OpenSearch settings + mappings for the code index.
///
/// Single shard, no replicas, with kNN enabled for the 768-dimension
/// `embedding` field (HNSW graph, cosine similarity, Lucene engine).
/// Exact-match fields are `keyword`; searchable text is `text`.
const INDEX_MAPPING: &str = r#"{
  "settings": {
    "number_of_shards": 1,
    "number_of_replicas": 0,
    "index.knn": true
  },
  "mappings": {
    "properties": {
      "file_path": { "type": "keyword" },
      "repo_owner": { "type": "keyword" },
      "repo_name": { "type": "keyword" },
      "language": { "type": "keyword" },
      "symbol_name": { "type": "keyword" },
      "symbol_kind": { "type": "keyword" },
      "signature": { "type": "text" },
      "docstring": { "type": "text" },
      "start_line": { "type": "integer" },
      "end_line": { "type": "integer" },
      "content": { "type": "text", "analyzer": "standard" },
      "branch": { "type": "keyword" },
      "source": { "type": "keyword" },
      "indexed_at": { "type": "date", "format": "epoch_millis" },
      "embedding": {
        "type": "knn_vector",
        "dimension": 768,
        "method": {
          "name": "hnsw",
          "space_type": "cosinesimil",
          "engine": "lucene"
        }
      }
    }
  }
}"#;

/// The raw JSON mapping used to create the code index.
///
/// Made `const fn` + `#[must_use]`: it is a pure accessor whose result is
/// pointless to discard.
#[must_use]
pub const fn index_mapping_json() -> &'static str {
    INDEX_MAPPING
}
|
||||||
|
|
||||||
|
pub async fn create_index_if_not_exists(client: &OpenSearch, index: &str) -> anyhow::Result<()> {
|
||||||
|
let exists = client
|
||||||
|
.indices()
|
||||||
|
.exists(opensearch::indices::IndicesExistsParts::Index(&[index]))
|
||||||
|
.send()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
if exists.status_code().is_success() {
|
||||||
|
info!(index, "Code index already exists");
|
||||||
|
return Ok(());
|
||||||
|
}
|
||||||
|
|
||||||
|
let mapping: serde_json::Value = serde_json::from_str(INDEX_MAPPING)?;
|
||||||
|
let response = client
|
||||||
|
.indices()
|
||||||
|
.create(opensearch::indices::IndicesCreateParts::Index(index))
|
||||||
|
.body(mapping)
|
||||||
|
.send()
|
||||||
|
.await?;
|
||||||
|
|
||||||
|
if !response.status_code().is_success() {
|
||||||
|
let body = response.text().await?;
|
||||||
|
anyhow::bail!("Failed to create code index {index}: {body}");
|
||||||
|
}
|
||||||
|
|
||||||
|
info!(index, "Created code index");
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;

    // Sanity-check the bundled mapping: parses as JSON and pins the field
    // types that search behavior depends on.
    #[test]
    fn test_index_mapping_is_valid_json() {
        let mapping: serde_json::Value = serde_json::from_str(index_mapping_json()).unwrap();
        assert!(mapping["mappings"]["properties"]["symbol_name"]["type"]
            .as_str()
            .unwrap()
            == "keyword");
        assert!(mapping["mappings"]["properties"]["embedding"]["type"]
            .as_str()
            .unwrap()
            == "knn_vector");
        assert!(mapping["mappings"]["properties"]["branch"]["type"]
            .as_str()
            .unwrap()
            == "keyword");
    }

    // Round-trips a fully-populated document through serde.
    #[test]
    fn test_symbol_document_serialize() {
        let doc = SymbolDocument {
            file_path: "src/orchestrator/mod.rs".into(),
            repo_owner: Some("studio".into()),
            repo_name: "sol".into(),
            language: "rust".into(),
            symbol_name: "generate".into(),
            symbol_kind: "function".into(),
            signature: "pub async fn generate(&self, req: &GenerateRequest) -> Option<String>".into(),
            docstring: "Generate a response using the ConversationRegistry.".into(),
            start_line: 80,
            end_line: 120,
            content: "pub async fn generate(...) { ... }".into(),
            branch: "mainline".into(),
            source: "gitea".into(),
            indexed_at: 1774310400000,
        };
        let json = serde_json::to_value(&doc).unwrap();
        assert_eq!(json["symbol_name"], "generate");
        assert_eq!(json["branch"], "mainline");
        assert_eq!(json["language"], "rust");
    }

    // Empty/None optional fields must be omitted entirely from the JSON
    // (serde skip attributes), keeping indexed documents compact.
    #[test]
    fn test_symbol_document_skip_empty_docstring() {
        let doc = SymbolDocument {
            file_path: "src/main.rs".into(),
            repo_owner: None,
            repo_name: "sol".into(),
            language: "rust".into(),
            symbol_name: "main".into(),
            symbol_kind: "function".into(),
            signature: "fn main()".into(),
            docstring: String::new(),
            start_line: 1,
            end_line: 10,
            content: "fn main() { ... }".into(),
            branch: "mainline".into(),
            source: "local".into(),
            indexed_at: 0,
        };
        let json_str = serde_json::to_string(&doc).unwrap();
        assert!(!json_str.contains("docstring"));
        assert!(!json_str.contains("repo_owner"));
    }
}
|
||||||
@@ -2,6 +2,8 @@ mod agent_ux;
|
|||||||
mod agents;
|
mod agents;
|
||||||
mod archive;
|
mod archive;
|
||||||
mod brain;
|
mod brain;
|
||||||
|
mod breadcrumbs;
|
||||||
|
mod code_index;
|
||||||
mod config;
|
mod config;
|
||||||
mod context;
|
mod context;
|
||||||
mod conversations;
|
mod conversations;
|
||||||
|
|||||||
Reference in New Issue
Block a user