// sol/src/breadcrumbs/mod.rs

//! Adaptive breadcrumbs — lightweight code context injected into every prompt.
//!
//! Queries the OpenSearch code index for symbols relevant to the user's
//! message. Retrieval is currently BM25-only; a neural component for true
//! hybrid (BM25 + neural) search is planned but not yet wired in. A default
//! project outline is always injected, then expanded with relevant signatures
//! and the first line of each docstring as the budget allows.

use opensearch::OpenSearch;
use serde::Deserialize;
use tracing::{debug, warn};

/// A symbol retrieved from the code index.
///
/// Deserialized from the `_source` object of a search hit, so the field names
/// below must match the field names stored in the index.
#[derive(Debug, Clone, Deserialize)]
pub struct RetrievedSymbol {
    /// Path of the source file the symbol was indexed from.
    pub file_path: String,
    /// Name of the symbol.
    pub symbol_name: String,
    /// Kind of symbol; the queries in this module filter on values such as
    /// `"struct"`, `"enum"`, `"trait"`, `"function"` and `"method"`.
    pub symbol_kind: String,
    /// Declaration signature; rendered verbatim in the breadcrumb entry.
    pub signature: String,
    /// Docstring text; falls back to an empty string when the field is absent.
    #[serde(default)]
    pub docstring: String,
    /// Line where the symbol starts (presumably 1-based — confirm against the indexer).
    pub start_line: u32,
}

/// Result of building breadcrumbs for a prompt.
///
/// Produced by `build_breadcrumbs`. `formatted` is rendered from `outline` and
/// `relevant`; the two inputs are returned alongside it.
#[derive(Debug)]
pub struct BreadcrumbResult {
    /// The default project outline (intended to be roughly 200 tokens).
    /// Empty when the outline query failed.
    pub outline: String,
    /// Relevant symbols from adaptive retrieval. Empty when the user message
    /// has fewer than three whitespace-separated words or the search failed.
    pub relevant: Vec<RetrievedSymbol>,
    /// Ready-to-inject formatted string: the outline followed by as many
    /// symbol entries as fit within the budget.
    pub formatted: String,
}

/// Assemble the breadcrumb context for one user message.
///
/// Steps:
/// 1. Fetch the project outline — always.
/// 2. Run the symbol search only when the message contains at least three
///    whitespace-separated words; shorter messages get no adaptive symbols.
/// 3. Render outline and symbols into one string capped by `token_budget`.
///
/// Both lookups are best-effort: on failure they log and yield an empty
/// outline / empty symbol list, so this function itself never fails.
///
/// NOTE: despite its name, `token_budget` is compared against the byte length
/// of the rendered text by the formatter, not against a token count.
pub async fn build_breadcrumbs(
    client: &OpenSearch,
    index: &str,
    repo_name: &str,
    branch: &str,
    user_message: &str,
    token_budget: usize,
) -> BreadcrumbResult {
    let outline = load_project_outline(client, index, repo_name, branch).await;

    // A third word exists exactly when the message has three or more words.
    let too_short = user_message.split_whitespace().nth(2).is_none();
    let relevant = if too_short {
        Vec::new()
    } else {
        hybrid_symbol_search(client, index, repo_name, branch, user_message, 10).await
    };

    let formatted = format_with_budget(&outline, &relevant, token_budget);

    BreadcrumbResult {
        outline,
        relevant,
        formatted,
    }
}

/// Load the project outline: distinct modules, key type names, key function names.
///
/// Runs a single aggregation-only search (`size: 0`) restricted to `repo_name`
/// and to documents on `branch`, `mainline` or `main`, and collects:
/// - up to 50 `file_path` buckets, reduced to their parent directory names,
/// - up to 20 `symbol_name` buckets for type-like symbol kinds,
/// - up to 20 `symbol_name` buckets for function-like symbol kinds.
///
/// Returns a short text block starting with `## project: <repo_name>`.
/// Best-effort: on a transport error, a non-success HTTP status, or an
/// unparsable body, a warning is logged and an empty string is returned.
async fn load_project_outline(
    client: &OpenSearch,
    index: &str,
    repo_name: &str,
    branch: &str,
) -> String {
    let query = serde_json::json!({
        "size": 0,
        "query": {
            "bool": {
                "filter": [
                    { "term": { "repo_name": repo_name } },
                    { "bool": { "should": [
                        { "term": { "branch": branch } },
                        { "term": { "branch": "mainline" } },
                        { "term": { "branch": "main" } }
                    ]}}
                ]
            }
        },
        "aggs": {
            "modules": {
                "terms": { "field": "file_path", "size": 50 }
            },
            "types": {
                "filter": { "terms": { "symbol_kind": ["struct", "enum", "trait", "class", "interface", "type"] } },
                "aggs": { "names": { "terms": { "field": "symbol_name", "size": 20 } } }
            },
            "functions": {
                "filter": { "terms": { "symbol_kind": ["function", "method", "async_function"] } },
                "aggs": { "names": { "terms": { "field": "symbol_name", "size": 20 } } }
            }
        }
    });

    // `send()` only fails on transport errors; a 4xx/5xx reply still arrives as
    // `Ok`, so convert error statuses into `Err` to get them logged as well.
    let response = match client
        .search(opensearch::SearchParts::Index(&[index]))
        .body(query)
        .send()
        .await
        .and_then(|r| r.error_for_status_code())
    {
        Ok(r) => r,
        Err(e) => {
            warn!("Failed to load project outline: {e}");
            return String::new();
        }
    };

    let body: serde_json::Value = match response.json().await {
        Ok(b) => b,
        Err(e) => {
            warn!("Failed to parse outline response: {e}");
            return String::new();
        }
    };

    // Reduce file paths to distinct directory names, keeping bucket order.
    let mut modules: Vec<&str> = Vec::new();
    if let Some(buckets) = body["aggregations"]["modules"]["buckets"].as_array() {
        let dirs = buckets
            .iter()
            .filter_map(|b| b["key"].as_str())
            .filter_map(parent_module);
        for dir in dirs {
            if !modules.contains(&dir) {
                modules.push(dir);
            }
        }
    }

    let type_names = extract_agg_names(&body["aggregations"]["types"]["names"]["buckets"]);
    let fn_names = extract_agg_names(&body["aggregations"]["functions"]["names"]["buckets"]);

    let mut result = format!("## project: {repo_name}\n");
    if !modules.is_empty() {
        result.push_str(&format!("modules: {}\n", modules.join(", ")));
    }
    if !type_names.is_empty() {
        result.push_str(&format!("key types: {}\n", type_names.join(", ")));
    }
    if !fn_names.is_empty() {
        result.push_str(&format!("key fns: {}\n", fn_names.join(", ")));
    }

    result
}

/// Name of the directory that directly contains `path`'s final segment,
/// e.g. `"src/orchestrator/mod.rs"` → `"orchestrator"`.
///
/// Returns `None` for paths without a directory component, and for a parent
/// that is empty or is the generic `src` directory.
fn parent_module(path: &str) -> Option<&str> {
    path.rsplit('/')
        .nth(1)
        .filter(|dir| !dir.is_empty() && *dir != "src")
}

/// Search the code index for symbols relevant to `user_message`.
///
/// Pipeline: `_analyze` the message into tokens → build a `bool` query that
/// matches the message against `content`, `signature` and `docstring` and the
/// tokens against `symbol_name` → deserialize up to `limit` hits.
///
/// Results are restricted to `repo_name` and to documents on `branch`,
/// `mainline` or `main`; documents on `branch` itself receive a score boost.
/// Despite the name, retrieval is currently BM25-only (see the TODO below).
///
/// Best-effort: a failed `_analyze` call only disables symbol-name matching;
/// a failed search (transport error, non-success HTTP status, unparsable body)
/// is logged and yields an empty list. Hits whose `_source` does not
/// deserialize into `RetrievedSymbol` are skipped.
async fn hybrid_symbol_search(
    client: &OpenSearch,
    index: &str,
    repo_name: &str,
    branch: &str,
    user_message: &str,
    limit: usize,
) -> Vec<RetrievedSymbol> {
    // Step 1: Analyze the query to extract key terms
    let analyze_query = serde_json::json!({
        "analyzer": "standard",
        "text": user_message
    });

    let mut tokens: Vec<String> = match client
        .indices()
        .analyze(opensearch::indices::IndicesAnalyzeParts::Index(index))
        .body(analyze_query)
        .send()
        .await
    {
        Ok(r) => {
            let body: serde_json::Value = r.json().await.unwrap_or_default();
            body["tokens"]
                .as_array()
                .map(|arr| {
                    arr.iter()
                        .filter_map(|t| t["token"].as_str().map(String::from))
                        .filter(|t| t.len() > 2) // skip very short tokens
                        .collect::<Vec<_>>()
                })
                .unwrap_or_default()
        }
        Err(e) => {
            debug!("Analyze failed (non-fatal): {e}");
            Vec::new()
        }
    };
    // Repeated words would only add redundant alternatives to the regexp.
    tokens.sort_unstable();
    tokens.dedup();

    // Step 2: Build the relevance clauses
    let mut should_clauses = vec![
        serde_json::json!({ "match": { "content": user_message } }),
        serde_json::json!({ "match": { "signature": { "query": user_message, "boost": 2.0 } } }),
        serde_json::json!({ "match": { "docstring": { "query": user_message, "boost": 1.5 } } }),
    ];

    // Add symbol name matching from analyzed tokens
    if !tokens.is_empty() {
        // Tokens are user-derived text, so regexp metacharacters are escaped
        // before each token is wrapped in a "contains" pattern.
        let pattern = tokens
            .iter()
            .map(|t| format!(".*{}.*", escape_regexp_token(t)))
            .collect::<Vec<_>>()
            .join("|");
        // The standard analyzer lowercases tokens, so the match must ignore
        // case or mixed-case symbol names could never match.
        should_clauses.push(serde_json::json!({
            "regexp": { "symbol_name": {
                "value": pattern,
                "case_insensitive": true,
                "boost": 3.0
            } }
        }));
    }

    let query = serde_json::json!({
        "size": limit,
        "_source": ["file_path", "symbol_name", "symbol_kind", "signature", "docstring", "start_line"],
        "query": {
            "bool": {
                // At least one relevance clause has to match.
                "must": [
                    { "bool": { "should": should_clauses, "minimum_should_match": 1 } }
                ],
                // Optional scoring clause: prefer hits on the requested branch.
                // It lives here rather than in `filter` because filter clauses
                // do not contribute to the score, so a boost there is ignored.
                "should": [
                    { "term": { "branch": { "value": branch, "boost": 2.0 } } }
                ],
                "filter": [
                    { "term": { "repo_name": repo_name } },
                    { "bool": { "should": [
                        { "term": { "branch": branch } },
                        { "term": { "branch": "mainline" } },
                        { "term": { "branch": "main" } }
                    ]}}
                ]
            }
        }
    });

    // TODO: Add neural search component when kNN is available on the index.
    // The hybrid pipeline (tuwunel_hybrid_pipeline) will combine BM25 + neural.
    // For now, use BM25-only search until embeddings are populated.

    // `send()` only fails on transport errors; turn 4xx/5xx replies into `Err`
    // too so a rejected query is logged instead of silently returning nothing.
    let response = match client
        .search(opensearch::SearchParts::Index(&[index]))
        .body(query)
        .send()
        .await
        .and_then(|r| r.error_for_status_code())
    {
        Ok(r) => r,
        Err(e) => {
            warn!("Hybrid symbol search failed: {e}");
            return Vec::new();
        }
    };

    let body: serde_json::Value = match response.json().await {
        Ok(b) => b,
        Err(e) => {
            warn!("Failed to parse search response: {e}");
            return Vec::new();
        }
    };

    body["hits"]["hits"]
        .as_array()
        .map(|hits| {
            hits.iter()
                .filter_map(|hit| serde_json::from_value(hit["_source"].clone()).ok())
                .collect()
        })
        .unwrap_or_default()
}

/// Backslash-escape the characters that are operators in Lucene regexp syntax
/// so that `token` is matched literally inside a `regexp` query.
fn escape_regexp_token(token: &str) -> String {
    let mut escaped = String::with_capacity(token.len());
    for ch in token.chars() {
        let is_operator = matches!(
            ch,
            '.' | '?' | '+' | '*' | '|' | '{' | '}' | '[' | ']' | '(' | ')' | '"' | '\\'
                | '#' | '@' | '&' | '<' | '>' | '~'
        );
        if is_operator {
            escaped.push('\\');
        }
        escaped.push(ch);
    }
    escaped
}

/// Format breadcrumbs within a size budget.
///
/// `budget` is the maximum byte length of the returned string.
///
/// - If `outline` alone exceeds the budget, it is cut at the nearest character
///   boundary at or below `budget` and returned without symbols.
/// - Otherwise symbols are appended in order under a `## relevant context`
///   header until the next entry would exceed the budget. Appending stops at
///   the first entry that does not fit, so ranking order is preserved.
/// - The header counts toward the budget and is only written when at least one
///   symbol entry fits after it.
fn format_with_budget(
    outline: &str,
    relevant: &[RetrievedSymbol],
    budget: usize,
) -> String {
    const HEADER: &str = "## relevant context\n";

    if outline.len() > budget {
        // Slicing inside a multi-byte character would panic, so back up to the
        // closest boundary (index 0 is always one, so this terminates).
        let mut cut = budget;
        while !outline.is_char_boundary(cut) {
            cut -= 1;
        }
        return outline[..cut].to_string();
    }

    let mut result = outline.to_string();
    let mut header_written = false;

    for sym in relevant {
        let entry = format_symbol(sym);
        let header_len = if header_written { 0 } else { HEADER.len() };
        if result.len() + header_len + entry.len() > budget {
            break;
        }
        if !header_written {
            result.push_str(HEADER);
            header_written = true;
        }
        result.push_str(&entry);
    }

    result
}

/// Format a single symbol as a breadcrumb entry.
///
/// The entry is the symbol's signature followed by a `// <file_path>:<start_line>`
/// locator, terminated by a newline. When the docstring has any non-blank
/// line, the first such line (trimmed) is emitted above the signature as a
/// `/// ` line; blank or whitespace-only docstrings produce no doc line.
fn format_symbol(sym: &RetrievedSymbol) -> String {
    // Docstrings often start with an empty line; skip ahead to the summary
    // instead of emitting a bare `/// `.
    let summary = sym
        .docstring
        .lines()
        .map(str::trim)
        .find(|line| !line.is_empty());

    let mut entry = String::new();
    if let Some(summary) = summary {
        entry.push_str("/// ");
        entry.push_str(summary);
        entry.push('\n');
    }
    entry.push_str(&format!(
        "{} // {}:{}\n",
        sym.signature, sym.file_path, sym.start_line
    ));
    entry
}

/// Collect the string `key` of every bucket in a terms-aggregation bucket list.
///
/// `buckets` is expected to be the JSON array found under an aggregation's
/// `buckets` field. Anything that is not an array yields an empty list, and
/// buckets whose `key` is missing or not a string are skipped.
fn extract_agg_names(buckets: &serde_json::Value) -> Vec<String> {
    let entries = match buckets.as_array() {
        Some(entries) => entries,
        None => return Vec::new(),
    };

    let mut names = Vec::new();
    for bucket in entries {
        if let Some(key) = bucket["key"].as_str() {
            names.push(String::from(key));
        }
    }
    names
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Only the first docstring line is rendered, above the signature line.
    #[test]
    fn test_format_symbol_with_docstring() {
        let sym = RetrievedSymbol {
            file_path: "src/orchestrator/mod.rs".into(),
            symbol_name: "generate".into(),
            symbol_kind: "function".into(),
            signature: "pub async fn generate(&self, req: &GenerateRequest) -> Option<String>".into(),
            docstring: "Generate a response using the ConversationRegistry.\nMore details here.".into(),
            start_line: 80,
        };
        let formatted = format_symbol(&sym);
        assert!(formatted.contains("/// Generate a response"));
        assert!(formatted.contains("src/orchestrator/mod.rs:80"));
        // Only first line of docstring
        assert!(!formatted.contains("More details"));
    }

    /// An empty docstring produces no `///` line at all.
    #[test]
    fn test_format_symbol_without_docstring() {
        let sym = RetrievedSymbol {
            file_path: "src/main.rs".into(),
            symbol_name: "main".into(),
            symbol_kind: "function".into(),
            signature: "fn main()".into(),
            docstring: String::new(),
            start_line: 1,
        };
        let formatted = format_symbol(&sym);
        assert!(!formatted.contains("///"));
        assert!(formatted.contains("fn main()"));
    }

    /// Symbols that would push the output past the budget are left out.
    #[test]
    fn test_format_with_budget_truncation() {
        let outline = "## project: test\nmodules: a, b, c\n";
        let symbols = vec![
            RetrievedSymbol {
                file_path: "a.rs".into(),
                symbol_name: "foo".into(),
                symbol_kind: "function".into(),
                signature: "fn foo()".into(),
                docstring: "Does foo.".into(),
                start_line: 1,
            },
            RetrievedSymbol {
                file_path: "b.rs".into(),
                symbol_name: "bar".into(),
                symbol_kind: "function".into(),
                signature: "fn bar()".into(),
                docstring: "Does bar.".into(),
                start_line: 1,
            },
        ];

        // Sizes in bytes: outline 34, section header 20, each symbol entry 33.
        // Outline + header + one symbol is 87 and a second symbol would make
        // it 120, so a budget of 100 admits exactly one symbol.
        let budget = 100;
        let result = format_with_budget(outline, &symbols, budget);
        assert!(result.starts_with(outline));
        assert!(result.contains("relevant context"));
        assert!(result.contains("fn foo()"));
        assert!(!result.contains("fn bar()"));
        assert!(result.len() <= budget);

        // At exactly 120 bytes both symbols fit.
        let result = format_with_budget(outline, &symbols, 120);
        assert!(result.contains("fn foo()"));
        assert!(result.contains("fn bar()"));
        assert_eq!(result.len(), 120);
    }

    /// With no symbols the outline is returned unchanged, without a header.
    #[test]
    fn test_format_with_budget_empty_relevant() {
        let outline = "## project: test\n";
        let result = format_with_budget(outline, &[], 1000);
        assert_eq!(result, outline);
        assert!(!result.contains("relevant context"));
    }
}