From a11b313301f6f61e3c4194af884b16e23780c0d4 Mon Sep 17 00:00:00 2001
From: Sienna Meridian Satterwhite
Date: Tue, 24 Mar 2026 09:36:42 +0000
Subject: [PATCH] feat: Gitea repo indexing via gRPC ReindexCode endpoint
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Gitea indexer (code_index/gitea.rs):
- Walks repos via GiteaClient API (list repos → traverse dirs → fetch files)
- Base64-decodes file content from Gitea API responses
- Extracts symbols with tree-sitter (Rust, TypeScript/JavaScript, Python)
- Indexes to sol_code OpenSearch index with repo/branch/source metadata
- Skips hidden dirs, vendor, node_modules, files >100KB
- delete_branch() for clean re-indexing

Server-side tree-sitter (code_index/symbols.rs):
- Full symbol extraction shared with CLI client
- extract_symbols(), extract_project_symbols(), detect_language()

gRPC ReindexCode RPC:
- ReindexCodeRequest: org, repo, branch (all optional filters)
- ReindexCodeResponse: repos_indexed, symbols_indexed, error
- Uses ToolRegistry's GiteaClient (already authenticated)
- Creates the sol_code index if it does not exist

ToolRegistry.gitea_client() accessor for reindex endpoint.
--- Cargo.lock | 70 +++- Cargo.toml | 4 + src/code_index/gitea.rs | 230 +++++++++++++ src/code_index/mod.rs | 4 +- src/code_index/symbols.rs | 659 ++++++++++++++++++++++++++++++++++++++ src/grpc/service.rs | 101 ++++++ src/tools/mod.rs | 4 + 7 files changed, 1069 insertions(+), 3 deletions(-) create mode 100644 src/code_index/gitea.rs create mode 100644 src/code_index/symbols.rs diff --git a/Cargo.lock b/Cargo.lock index ef3893d..dd20e41 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1357,7 +1357,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -1371,6 +1371,12 @@ dependencies = [ "syn", ] +[[package]] +name = "dotenv" +version = "0.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f" + [[package]] name = "dprint-swc-ext" version = "0.26.0" @@ -3771,7 +3777,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -4705,6 +4711,7 @@ dependencies = [ "deno_ast", "deno_core", "deno_error", + "dotenv", "futures", "jsonwebtoken", "libsqlite3-sys", @@ -4730,6 +4737,10 @@ dependencies = [ "tonic-prost-build", "tracing", "tracing-subscriber", + "tree-sitter", + "tree-sitter-python", + "tree-sitter-rust", + "tree-sitter-typescript", "url", "uuid", ] @@ -4808,6 +4819,12 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + [[package]] name = "string_enum" version = "1.0.2" @@ -5871,6 +5888,55 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "tree-sitter" +version = "0.24.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75" +dependencies = [ + "cc", + 
"regex", + "regex-syntax", + "streaming-iterator", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-language" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" + +[[package]] +name = "tree-sitter-python" +version = "0.23.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-rust" +version = "0.23.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca8ccb3e3a3495c8a943f6c3fd24c3804c471fd7f4f16087623c7fa4c0068e8a" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-typescript" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5f76ed8d947a75cc446d5fccd8b602ebf0cde64ccf2ffa434d873d7a575eff" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "triomphe" version = "0.1.15" diff --git a/Cargo.toml b/Cargo.toml index e2d69a9..b1aab33 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -44,6 +44,10 @@ prost = "0.14" tokio-stream = "0.1" jsonwebtoken = "9" tokenizers = { version = "0.22", default-features = false, features = ["onig", "http"] } +tree-sitter = "0.24" +tree-sitter-rust = "0.23" +tree-sitter-typescript = "0.23" +tree-sitter-python = "0.23" [dev-dependencies] dotenv = "0.15" diff --git a/src/code_index/gitea.rs b/src/code_index/gitea.rs new file mode 100644 index 0000000..e3d66f8 --- /dev/null +++ b/src/code_index/gitea.rs @@ -0,0 +1,230 @@ +//! Gitea repo indexer — walks repos via the Gitea API, extracts symbols +//! with tree-sitter, and indexes them to OpenSearch. 
+
+use std::sync::Arc;
+
+use tracing::{debug, error, info, warn};
+
+use super::indexer::CodeIndexer;
+use super::schema::SymbolDocument;
+use super::symbols;
+use crate::sdk::gitea::GiteaClient;
+
+/// Index all repos for an organization (or all accessible repos).
+///
+/// Every repository returned by `list_repos` is indexed at its default branch
+/// through [`index_repo`]. A repository whose details cannot be fetched, or whose
+/// indexing fails, is logged with `warn!` and skipped so that one bad repository
+/// does not abort the run. The indexer is flushed once, after the last repository.
+///
+/// NOTE(review): `list_repos` is called once with `Some(100)` as its last
+/// argument — presumably a page size, in which case installations with more
+/// repositories are only partially indexed. Confirm against `GiteaClient`.
+///
+/// # Errors
+///
+/// Returns an error only when the repository list itself cannot be fetched.
+///
+/// Returns the total number of symbols indexed across all repositories.
+pub async fn index_all_repos(
+    gitea: &GiteaClient,
+    indexer: &mut CodeIndexer,
+    admin_localpart: &str,
+    org: Option<&str>,
+) -> anyhow::Result<u32> {
+    let repos = gitea
+        .list_repos(admin_localpart, None, org, Some(100))
+        .await
+        .map_err(|e| anyhow::anyhow!("Failed to list repos: {e}"))?;
+
+    let mut total_symbols = 0u32;
+
+    for repo in &repos {
+        // full_name is "owner/name"; without a slash the repo cannot be addressed.
+        let Some((owner, name)) = repo.full_name.split_once('/') else {
+            warn!(full_name = repo.full_name.as_str(), "Invalid repo full_name");
+            continue;
+        };
+
+        // Get full repo details for default_branch
+        let full_repo = match gitea.get_repo(admin_localpart, owner, name).await {
+            Ok(r) => r,
+            Err(e) => {
+                warn!(owner, name, "Failed to get repo details: {e}");
+                continue;
+            }
+        };
+        let default_branch = full_repo.default_branch.as_str();
+
+        info!(owner, name, branch = default_branch, "Indexing repo");
+
+        match index_repo(gitea, indexer, admin_localpart, owner, name, default_branch).await {
+            Ok(count) => {
+                total_symbols += count;
+                info!(owner, name, count, "Indexed repo symbols");
+            }
+            Err(e) => {
+                warn!(owner, name, "Failed to index repo: {e}");
+            }
+        }
+    }
+
+    // Push out whatever is still buffered in the indexer.
+    indexer.flush().await;
+    Ok(total_symbols)
+}
+
+/// Index a single repo at a given branch.
+///
+/// The repository root is listed before anything is deleted, so an unreadable
+/// repository or branch returns an error and leaves previously indexed symbols
+/// untouched. After that the tree is walked depth-first through `get_file`;
+/// sub-directories that fail to list and files that fail to download or decode
+/// are skipped (best effort).
+///
+/// Entries whose name starts with `.` or is one of `target`, `vendor`,
+/// `node_modules`, `dist`, `__pycache__` are ignored, as are files larger than
+/// 100 000 bytes and files whose extension `symbols::detect_language` does not
+/// recognise.
+///
+/// NOTE(review): `delete_branch` is called with the repository name only, so two
+/// owners with a same-named repository would presumably clear each other's
+/// symbols — confirm against `CodeIndexer::delete_branch`.
+///
+/// # Errors
+///
+/// Returns an error when the repository root cannot be listed.
+///
+/// Returns the number of symbols handed to the indexer.
+pub async fn index_repo(
+    gitea: &GiteaClient,
+    indexer: &mut CodeIndexer,
+    localpart: &str,
+    owner: &str,
+    repo: &str,
+    branch: &str,
+) -> anyhow::Result<u32> {
+    // List the root first: if the repo or branch is unreadable, the existing
+    // index entries must not have been wiped yet.
+    let mut entries = list_directory(gitea, localpart, owner, repo, "", branch)
+        .await
+        .map_err(|e| anyhow::anyhow!("Failed to list {owner}/{repo} at {branch}: {e}"))?;
+
+    // Delete existing symbols for this repo+branch before re-indexing
+    indexer.delete_branch(repo, branch).await;
+
+    let mut count = 0u32;
+    let mut dirs_to_visit: Vec<String> = Vec::new();
+
+    loop {
+        for item in &entries {
+            let name = item["name"].as_str().unwrap_or("");
+            let path = item["path"].as_str().unwrap_or("");
+            let file_type = item["type"].as_str().unwrap_or("");
+
+            // An entry without a path cannot be fetched, and queueing "" as a
+            // directory would list the root again and never terminate.
+            if path.is_empty() {
+                continue;
+            }
+
+            // Skip hidden entries (this covers `.git`), vendor and build dirs
+            if name.starts_with('.')
+                || matches!(
+                    name,
+                    "target" | "vendor" | "node_modules" | "dist" | "__pycache__"
+                )
+            {
+                continue;
+            }
+
+            match file_type {
+                "dir" => dirs_to_visit.push(path.to_string()),
+                "file" => {
+                    // Only supported source files
+                    if symbols::detect_language(path).is_none() {
+                        continue;
+                    }
+
+                    // Skip large files
+                    if item["size"].as_u64().unwrap_or(0) > 100_000 {
+                        continue;
+                    }
+
+                    count +=
+                        index_file(gitea, indexer, localpart, owner, repo, branch, path).await;
+                }
+                // Anything else (symlinks, submodules, ...) is ignored.
+                _ => {}
+            }
+        }
+
+        // Move on to the next directory that can be listed; done when none is left.
+        entries = loop {
+            let Some(dir_path) = dirs_to_visit.pop() else {
+                return Ok(count);
+            };
+            match list_directory(gitea, localpart, owner, repo, &dir_path, branch).await {
+                Ok(listing) => break listing,
+                Err(e) => {
+                    debug!(owner, repo, path = dir_path.as_str(), "Failed to list directory: {e}");
+                }
+            }
+        };
+    }
+}
+
+/// List one directory of the repository through `get_file`.
+///
+/// Returns the entries of the listing, or an empty list when the response is not
+/// a JSON array (a single-file response rather than a directory listing).
+///
+/// NOTE(review): the return type of `get_file` is not visible here; it is assumed
+/// to serialise to the JSON that Gitea returned — confirm.
+async fn list_directory(
+    gitea: &GiteaClient,
+    localpart: &str,
+    owner: &str,
+    repo: &str,
+    dir_path: &str,
+    branch: &str,
+) -> anyhow::Result<Vec<serde_json::Value>> {
+    let listing = gitea
+        .get_file(localpart, owner, repo, dir_path, Some(branch))
+        .await
+        .map_err(|e| anyhow::anyhow!("{e}"))?;
+
+    // Convert straight to a JSON value instead of serialising to text and
+    // parsing that text back.
+    match serde_json::to_value(&listing).unwrap_or_default() {
+        serde_json::Value::Array(items) => Ok(items),
+        _ => Ok(Vec::new()),
+    }
+}
+
+/// Download one source file, extract its symbols and hand them to the indexer.
+///
+/// Returns the number of symbols added; `0` when the file cannot be fetched or
+/// decoded, or contains no symbols.
+async fn index_file(
+    gitea: &GiteaClient,
+    indexer: &mut CodeIndexer,
+    localpart: &str,
+    owner: &str,
+    repo: &str,
+    branch: &str,
+    path: &str,
+) -> u32 {
+    // Fetch file content
+    let Some(content) = fetch_file_content(gitea, localpart, owner, repo, path, branch).await
+    else {
+        return 0;
+    };
+
+    // Extract symbols
+    let syms = symbols::extract_symbols(path, &content);
+    let now = chrono::Utc::now().timestamp_millis();
+    let mut added = 0u32;
+
+    for sym in syms {
+        // Build content snippet for embedding
+        let body = extract_body(&content, sym.start_line, sym.end_line);
+
+        indexer
+            .add(SymbolDocument {
+                file_path: path.to_string(),
+                repo_owner: Some(owner.to_string()),
+                repo_name: repo.to_string(),
+                language: sym.language,
+                symbol_name: sym.name,
+                symbol_kind: sym.kind,
+                signature: sym.signature,
+                docstring: sym.docstring,
+                start_line: sym.start_line,
+                end_line: sym.end_line,
+                content: body,
+                branch: branch.to_string(),
+                source: "gitea".into(),
+                indexed_at: now,
+            })
+            .await;
+        added += 1;
+    }
+
+    added
+}
+
+/// Fetch and decode a file's content from Gitea (base64-encoded API response).
+///
+/// Returns `None` when the request fails, the response has no string `content`
+/// field, the payload is not valid base64, or the decoded bytes are not UTF-8.
+async fn fetch_file_content(
+    gitea: &GiteaClient,
+    localpart: &str,
+    owner: &str,
+    repo: &str,
+    path: &str,
+    branch: &str,
+) -> Option<String> {
+    let response = gitea
+        .get_file(localpart, owner, repo, path, Some(branch))
+        .await
+        .ok()?;
+
+    let json = serde_json::to_value(&response).ok()?;
+
+    // Content is base64-encoded; Gitea adds newlines in base64, so drop every
+    // ASCII whitespace character before decoding.
+    let encoded = json["content"].as_str()?;
+    let cleaned: String = encoded
+        .chars()
+        .filter(|c| !c.is_ascii_whitespace())
+        .collect();
+    let decoded =
+        base64::Engine::decode(&base64::engine::general_purpose::STANDARD, &cleaned).ok()?;
+    String::from_utf8(decoded).ok()
+}
+
+/// Extract the body of a symbol from source content.
+///
+/// `start_line` and `end_line` are 1-based and inclusive. Lines outside the
+/// content are ignored, so a range that lies past the end of the file, or whose
+/// start is after its end, yields an empty string instead of panicking.
+///
+/// Bodies longer than 500 bytes are cut at the last character boundary at or
+/// below byte 497 and terminated with `…`, so the result never exceeds 500 bytes
+/// and is never split inside a multi-byte character.
+fn extract_body(content: &str, start_line: u32, end_line: u32) -> String {
+    let first = (start_line as usize).saturating_sub(1);
+    // Number of lines wanted; zero when the range is inverted.
+    let wanted = (end_line as usize).saturating_sub(first);
+
+    let mut body = content
+        .lines()
+        .skip(first)
+        .take(wanted)
+        .collect::<Vec<_>>()
+        .join("\n");
+
+    if body.len() > 500 {
+        // Byte 497 may fall inside a multi-byte character; back up to a boundary.
+        let mut cut = 497;
+        while !body.is_char_boundary(cut) {
+            cut -= 1;
+        }
+        body.truncate(cut);
+        body.push('…');
+    }
+    body
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_extract_body() {
+        let content = "line 1\nline 2\nline 3\nline 4\nline 5";
+        assert_eq!(extract_body(content, 2, 4), "line 2\nline 3\nline 4");
+    }
+
+    #[test]
+    fn test_extract_body_truncation() {
+        let long_content: String = (0..100)
+            .map(|i| format!("line {i} with some content to make it longer"))
+            .collect::<Vec<_>>()
+            .join("\n");
+        let body = extract_body(&long_content, 1, 100);
+        assert!(body.len() <= 501);
+        assert!(body.ends_with('…'));
+    }
+
+    #[test]
+    fn test_extract_body_out_of_range() {
+        assert_eq!(extract_body("a\nb", 5, 9), "");
+        assert_eq!(extract_body("a\nb", 3, 1), "");
+    }
+
+    #[test]
+    fn test_extract_body_multibyte_truncation() {
+        let content = "é".repeat(400);
+        let body = extract_body(&content, 1, 1);
+        assert!(body.len() <= 500);
+        assert!(body.ends_with('…'));
+    }
+}
diff --git a/src/code_index/mod.rs b/src/code_index/mod.rs
index 967008b..ff176f1 100644
--- a/src/code_index/mod.rs
+++ b/src/code_index/mod.rs
@@ -3,5 +3,7 @@
 //! Indexes symbols (functions, structs, enums, traits) with their signatures,
 //! docstrings, and body content. Supports branch-aware semantic search.
 
-pub mod schema;
+pub mod gitea;
 pub mod indexer;
+pub mod schema;
+pub mod symbols;
diff --git a/src/code_index/symbols.rs b/src/code_index/symbols.rs
new file mode 100644
index 0000000..7171f22
--- /dev/null
+++ b/src/code_index/symbols.rs
@@ -0,0 +1,659 @@
+//! Symbol extraction from source code using tree-sitter.
+//!
+//! Extracts function signatures, struct/enum/trait definitions, and
+//! docstrings from Rust, TypeScript, and Python files. These symbols
+//! are sent to Sol for indexing in the code search index.
+
+use std::path::Path;
+use tracing::debug;
+
+/// An extracted code symbol with file context.
+#[derive(Debug, Clone)]
+pub struct ProjectSymbol {
+    pub file_path: String, // relative to project root
+    pub name: String,
+    pub kind: String,
+    pub signature: String,
+    pub docstring: String,
+    pub start_line: u32,
+    pub end_line: u32,
+    pub language: String,
+    pub content: String, // symbol source text, truncated to at most 500 bytes
+}
+
+/// Extract symbols from all source files in a project.
+///
+/// Walks `project_root` recursively and returns one [`ProjectSymbol`] per symbol
+/// found in every file whose extension [`detect_language`] recognises.
+/// Directories and files that cannot be read are silently skipped.
+pub fn extract_project_symbols(project_root: &str) -> Vec<ProjectSymbol> {
+    let root = Path::new(project_root);
+    let mut symbols = Vec::new();
+
+    walk_directory(root, root, &mut symbols);
+    debug!(count = symbols.len(), "Extracted project symbols");
+    symbols
+}
+
+/// Recursively collect the symbols of every supported source file under `dir`.
+///
+/// `root` is the project root used to make `file_path` relative. Entries whose
+/// name starts with `.` or is one of `target`, `vendor`, `node_modules`, `dist`,
+/// `build`, `__pycache__` are skipped, as are files that are not valid UTF-8 or
+/// are larger than 100 000 bytes.
+fn walk_directory(dir: &Path, root: &Path, symbols: &mut Vec<ProjectSymbol>) {
+    let Ok(entries) = std::fs::read_dir(dir) else { return };
+
+    for entry in entries.flatten() {
+        let path = entry.path();
+        let name = entry.file_name().to_string_lossy().to_string();
+
+        // Skip hidden entries (this covers `.git`), vendor, target, node_modules, etc.
+        if name.starts_with('.')
+            || matches!(
+                name.as_str(),
+                "target" | "vendor" | "node_modules" | "dist" | "build" | "__pycache__"
+            )
+        {
+            continue;
+        }
+
+        if path.is_dir() {
+            walk_directory(&path, root, symbols);
+            continue;
+        }
+        if !path.is_file() {
+            continue;
+        }
+
+        let path_str = path.to_string_lossy().to_string();
+        if detect_language(&path_str).is_none() {
+            continue;
+        }
+
+        // Read file (unreadable and non-UTF-8 files are skipped)
+        let Ok(content) = std::fs::read_to_string(&path) else { continue };
+        if content.len() > 100_000 {
+            continue; // skip >100KB
+        }
+
+        let rel_path = path
+            .strip_prefix(root)
+            .map(|p| p.to_string_lossy().to_string())
+            .unwrap_or_else(|_| path_str.clone());
+
+        // Byte offset at which every line starts, computed once per file. Offsets
+        // are taken from the real '\n' positions, so CRLF files are handled too.
+        let line_starts: Vec<usize> = std::iter::once(0)
+            .chain(content.match_indices('\n').map(|(i, _)| i + 1))
+            .collect();
+
+        for sym in extract_symbols(&path_str, &content) {
+            let body = symbol_body(&content, &line_starts, sym.start_line, sym.end_line);
+
+            symbols.push(ProjectSymbol {
+                file_path: rel_path.clone(),
+                name: sym.name,
+                kind: sym.kind,
+                signature: sym.signature,
+                docstring: sym.docstring,
+                start_line: sym.start_line,
+                end_line: sym.end_line,
+                language: sym.language,
+                content: body,
+            });
+        }
+    }
+}
+
+/// Return the text of lines `start_line..=end_line` (1-based), at most 500 bytes.
+///
+/// `line_starts[k]` must be the byte offset at which line `k + 1` of `content`
+/// starts. Longer bodies are cut at the last character boundary at or below byte
+/// 497 and terminated with `…`. Out-of-range lines clamp to the end of `content`.
+fn symbol_body(content: &str, line_starts: &[usize], start_line: u32, end_line: u32) -> String {
+    let first = (start_line as usize).saturating_sub(1);
+    let from = line_starts.get(first).copied().unwrap_or(content.len());
+    // The start of the line after `end_line` is the end of the body.
+    let to = line_starts
+        .get(end_line as usize)
+        .copied()
+        .unwrap_or(content.len())
+        .max(from);
+
+    let body = &content[from..to];
+    if body.len() <= 500 {
+        return body.to_string();
+    }
+
+    // Byte 497 may fall inside a multi-byte character; back up to a boundary.
+    let mut cut = 497;
+    while !body.is_char_boundary(cut) {
+        cut -= 1;
+    }
+    format!("{}…", &body[..cut])
+}
+
+/// An extracted code symbol.
+#[derive(Debug, Clone)]
+pub struct CodeSymbol {
+    pub name: String,
+    pub kind: String, // "function", "struct", "enum", "trait", "class", "interface", "method"
+    pub signature: String, // full signature line
+    pub docstring: String, // doc comment / docstring
+    pub start_line: u32, // 1-based
+    pub end_line: u32, // 1-based
+    pub language: String,
+}
+
+/// Detect language from file extension.
+///
+/// Returns `None` for paths without an extension and for extensions other than
+/// `rs`, `ts`, `tsx`, `js`, `jsx` and `py`. The match is case-sensitive.
+pub fn detect_language(path: &str) -> Option<&'static str> {
+    let ext = Path::new(path).extension()?.to_str()?;
+    match ext {
+        "rs" => Some("rust"),
+        "ts" | "tsx" => Some("typescript"),
+        "js" | "jsx" => Some("javascript"),
+        "py" => Some("python"),
+        _ => None,
+    }
+}
+
+/// Extract symbols from a source file's content.
+pub fn extract_symbols(path: &str, content: &str) -> Vec { + let Some(lang) = detect_language(path) else { + return Vec::new(); + }; + + match lang { + "rust" => extract_rust_symbols(content), + "typescript" | "javascript" => extract_ts_symbols(content), + "python" => extract_python_symbols(content), + _ => Vec::new(), + } +} + +// ── Rust ──────────────────────────────────────────────────────────────── + +fn extract_rust_symbols(content: &str) -> Vec { + let mut parser = tree_sitter::Parser::new(); + parser.set_language(&tree_sitter_rust::LANGUAGE.into()).ok(); + + let Some(tree) = parser.parse(content, None) else { + return Vec::new(); + }; + + let mut symbols = Vec::new(); + let root = tree.root_node(); + let bytes = content.as_bytes(); + + walk_rust_node(root, bytes, content, &mut symbols); + symbols +} + +fn walk_rust_node( + node: tree_sitter::Node, + bytes: &[u8], + source: &str, + symbols: &mut Vec, +) { + match node.kind() { + "function_item" | "function_signature_item" => { + if let Some(sym) = extract_rust_function(node, bytes, source) { + symbols.push(sym); + } + } + "struct_item" => { + if let Some(sym) = extract_rust_type(node, bytes, source, "struct") { + symbols.push(sym); + } + } + "enum_item" => { + if let Some(sym) = extract_rust_type(node, bytes, source, "enum") { + symbols.push(sym); + } + } + "trait_item" => { + if let Some(sym) = extract_rust_type(node, bytes, source, "trait") { + symbols.push(sym); + } + } + "impl_item" => { + // Walk impl methods + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + if child.kind() == "declaration_list" { + walk_rust_node(child, bytes, source, symbols); + } + } + } + } + _ => { + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + walk_rust_node(child, bytes, source, symbols); + } + } + } + } +} + +fn extract_rust_function(node: tree_sitter::Node, bytes: &[u8], source: &str) -> Option { + let name = node.child_by_field_name("name")?; + let name_str = 
name.utf8_text(bytes).ok()?.to_string(); + + // Build signature: everything from start to the opening brace (or end if no body) + let start_byte = node.start_byte(); + let sig_end = find_rust_sig_end(node, source); + let signature = source[start_byte..sig_end].trim().to_string(); + + // Extract doc comment (line comments starting with /// before the function) + let docstring = extract_rust_doc_comment(node, source); + + Some(CodeSymbol { + name: name_str, + kind: "function".into(), + signature, + docstring, + start_line: node.start_position().row as u32 + 1, + end_line: node.end_position().row as u32 + 1, + language: "rust".into(), + }) +} + +fn extract_rust_type(node: tree_sitter::Node, bytes: &[u8], source: &str, kind: &str) -> Option { + let name = node.child_by_field_name("name")?; + let name_str = name.utf8_text(bytes).ok()?.to_string(); + + // Signature: first line of the definition + let start = node.start_byte(); + let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte()); + let signature = source[start..first_line_end].trim().to_string(); + + let docstring = extract_rust_doc_comment(node, source); + + Some(CodeSymbol { + name: name_str, + kind: kind.into(), + signature, + docstring, + start_line: node.start_position().row as u32 + 1, + end_line: node.end_position().row as u32 + 1, + language: "rust".into(), + }) +} + +fn find_rust_sig_end(node: tree_sitter::Node, source: &str) -> usize { + // Find the opening brace + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + if child.kind() == "block" || child.kind() == "field_declaration_list" + || child.kind() == "enum_variant_list" || child.kind() == "declaration_list" + { + return child.start_byte(); + } + } + } + // No body (e.g., trait method signature) + node.end_byte().min(source.len()) +} + +fn extract_rust_doc_comment(node: tree_sitter::Node, source: &str) -> String { + let start_line = node.start_position().row; + if start_line == 0 { + return 
String::new(); + } + + let lines: Vec<&str> = source.lines().collect(); + let mut doc_lines = Vec::new(); + + // Walk backwards from the line before the node + let mut line_idx = start_line.saturating_sub(1); + loop { + if line_idx >= lines.len() { + break; + } + let line = lines[line_idx].trim(); + if line.starts_with("///") { + doc_lines.push(line.trim_start_matches("///").trim()); + } else if line.starts_with("#[") || line.is_empty() { + // Skip attributes and blank lines between doc and function + if line.is_empty() && !doc_lines.is_empty() { + break; // blank line after doc block = stop + } + } else { + break; + } + if line_idx == 0 { + break; + } + line_idx -= 1; + } + + doc_lines.reverse(); + doc_lines.join("\n") +} + +// ── TypeScript / JavaScript ───────────────────────────────────────────── + +fn extract_ts_symbols(content: &str) -> Vec { + let mut parser = tree_sitter::Parser::new(); + parser.set_language(&tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()).ok(); + + let Some(tree) = parser.parse(content, None) else { + return Vec::new(); + }; + + let mut symbols = Vec::new(); + walk_ts_node(tree.root_node(), content.as_bytes(), content, &mut symbols); + symbols +} + +fn walk_ts_node( + node: tree_sitter::Node, + bytes: &[u8], + source: &str, + symbols: &mut Vec, +) { + match node.kind() { + "function_declaration" | "method_definition" | "arrow_function" => { + if let Some(name) = node.child_by_field_name("name") { + let name_str = name.utf8_text(bytes).unwrap_or("").to_string(); + if !name_str.is_empty() { + let start = node.start_byte(); + let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte()); + symbols.push(CodeSymbol { + name: name_str, + kind: "function".into(), + signature: source[start..first_line_end].trim().to_string(), + docstring: String::new(), // TODO: JSDoc extraction + start_line: node.start_position().row as u32 + 1, + end_line: node.end_position().row as u32 + 1, + language: "typescript".into(), + 
}); + } + } + } + "class_declaration" | "interface_declaration" | "type_alias_declaration" | "enum_declaration" => { + if let Some(name) = node.child_by_field_name("name") { + let name_str = name.utf8_text(bytes).unwrap_or("").to_string(); + let kind = match node.kind() { + "class_declaration" => "class", + "interface_declaration" => "interface", + "enum_declaration" => "enum", + _ => "type", + }; + let start = node.start_byte(); + let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte()); + symbols.push(CodeSymbol { + name: name_str, + kind: kind.into(), + signature: source[start..first_line_end].trim().to_string(), + docstring: String::new(), + start_line: node.start_position().row as u32 + 1, + end_line: node.end_position().row as u32 + 1, + language: "typescript".into(), + }); + } + } + _ => {} + } + + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + walk_ts_node(child, bytes, source, symbols); + } + } +} + +// ── Python ────────────────────────────────────────────────────────────── + +fn extract_python_symbols(content: &str) -> Vec { + let mut parser = tree_sitter::Parser::new(); + parser.set_language(&tree_sitter_python::LANGUAGE.into()).ok(); + + let Some(tree) = parser.parse(content, None) else { + return Vec::new(); + }; + + let mut symbols = Vec::new(); + walk_python_node(tree.root_node(), content.as_bytes(), content, &mut symbols); + symbols +} + +fn walk_python_node( + node: tree_sitter::Node, + bytes: &[u8], + source: &str, + symbols: &mut Vec, +) { + match node.kind() { + "function_definition" => { + if let Some(name) = node.child_by_field_name("name") { + let name_str = name.utf8_text(bytes).unwrap_or("").to_string(); + let start = node.start_byte(); + let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte()); + let docstring = extract_python_docstring(node, bytes); + symbols.push(CodeSymbol { + name: name_str, + kind: "function".into(), + signature: 
source[start..first_line_end].trim().to_string(), + docstring, + start_line: node.start_position().row as u32 + 1, + end_line: node.end_position().row as u32 + 1, + language: "python".into(), + }); + } + } + "class_definition" => { + if let Some(name) = node.child_by_field_name("name") { + let name_str = name.utf8_text(bytes).unwrap_or("").to_string(); + let start = node.start_byte(); + let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte()); + let docstring = extract_python_docstring(node, bytes); + symbols.push(CodeSymbol { + name: name_str, + kind: "class".into(), + signature: source[start..first_line_end].trim().to_string(), + docstring, + start_line: node.start_position().row as u32 + 1, + end_line: node.end_position().row as u32 + 1, + language: "python".into(), + }); + } + } + _ => {} + } + + for i in 0..node.child_count() { + if let Some(child) = node.child(i) { + walk_python_node(child, bytes, source, symbols); + } + } +} + +fn extract_python_docstring(node: tree_sitter::Node, bytes: &[u8]) -> String { + // Python docstrings are the first expression_statement in the body + if let Some(body) = node.child_by_field_name("body") { + if let Some(first_stmt) = body.child(0) { + if first_stmt.kind() == "expression_statement" { + if let Some(expr) = first_stmt.child(0) { + if expr.kind() == "string" { + let text = expr.utf8_text(bytes).unwrap_or(""); + // Strip triple quotes + let trimmed = text + .trim_start_matches("\"\"\"") + .trim_start_matches("'''") + .trim_end_matches("\"\"\"") + .trim_end_matches("'''") + .trim(); + return trimmed.to_string(); + } + } + } + } + } + String::new() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_detect_language() { + assert_eq!(detect_language("src/main.rs"), Some("rust")); + assert_eq!(detect_language("app.ts"), Some("typescript")); + assert_eq!(detect_language("app.tsx"), Some("typescript")); + assert_eq!(detect_language("script.py"), Some("python")); + 
assert_eq!(detect_language("script.js"), Some("javascript")); + assert_eq!(detect_language("data.json"), None); + assert_eq!(detect_language("README.md"), None); + } + + #[test] + fn test_extract_rust_function() { + let source = r#" +/// Generate a response. +pub async fn generate(&self, req: &GenerateRequest) -> Option { + self.run_and_emit(req).await +} +"#; + let symbols = extract_rust_symbols(source); + assert!(!symbols.is_empty(), "Should extract at least one symbol"); + + let func = &symbols[0]; + assert_eq!(func.name, "generate"); + assert_eq!(func.kind, "function"); + assert!(func.signature.contains("pub async fn generate")); + assert!(func.docstring.contains("Generate a response")); + assert_eq!(func.language, "rust"); + } + + #[test] + fn test_extract_rust_struct() { + let source = r#" +/// A request to generate. +pub struct GenerateRequest { + pub text: String, + pub user_id: String, +} +"#; + let symbols = extract_rust_symbols(source); + let structs: Vec<_> = symbols.iter().filter(|s| s.kind == "struct").collect(); + assert!(!structs.is_empty()); + assert_eq!(structs[0].name, "GenerateRequest"); + assert!(structs[0].docstring.contains("request to generate")); + } + + #[test] + fn test_extract_rust_enum() { + let source = r#" +/// Whether server or client. +pub enum ToolSide { + Server, + Client, +} +"#; + let symbols = extract_rust_symbols(source); + let enums: Vec<_> = symbols.iter().filter(|s| s.kind == "enum").collect(); + assert!(!enums.is_empty()); + assert_eq!(enums[0].name, "ToolSide"); + } + + #[test] + fn test_extract_rust_trait() { + let source = r#" +pub trait Executor { + fn execute(&self, args: &str) -> String; +} +"#; + let symbols = extract_rust_symbols(source); + let traits: Vec<_> = symbols.iter().filter(|s| s.kind == "trait").collect(); + assert!(!traits.is_empty()); + assert_eq!(traits[0].name, "Executor"); + } + + #[test] + fn test_extract_rust_impl_methods() { + let source = r#" +impl Orchestrator { + /// Create new. 
+ pub fn new(config: Config) -> Self { + Self { config } + } + + /// Subscribe to events. + pub fn subscribe(&self) -> Receiver { + self.tx.subscribe() + } +} +"#; + let symbols = extract_rust_symbols(source); + let fns: Vec<_> = symbols.iter().filter(|s| s.kind == "function").collect(); + assert!(fns.len() >= 2, "Should find impl methods, got {}", fns.len()); + let names: Vec<&str> = fns.iter().map(|s| s.name.as_str()).collect(); + assert!(names.contains(&"new")); + assert!(names.contains(&"subscribe")); + } + + #[test] + fn test_extract_ts_function() { + let source = r#" +function greet(name: string): string { + return `Hello, ${name}`; +} +"#; + let symbols = extract_ts_symbols(source); + assert!(!symbols.is_empty()); + assert_eq!(symbols[0].name, "greet"); + assert_eq!(symbols[0].kind, "function"); + } + + #[test] + fn test_extract_ts_class() { + let source = r#" +class UserService { + constructor(private db: Database) {} + + async getUser(id: string): Promise { + return this.db.find(id); + } +} +"#; + let symbols = extract_ts_symbols(source); + let classes: Vec<_> = symbols.iter().filter(|s| s.kind == "class").collect(); + assert!(!classes.is_empty()); + assert_eq!(classes[0].name, "UserService"); + } + + #[test] + fn test_extract_ts_interface() { + let source = r#" +interface User { + id: string; + name: string; + email?: string; +} +"#; + let symbols = extract_ts_symbols(source); + let ifaces: Vec<_> = symbols.iter().filter(|s| s.kind == "interface").collect(); + assert!(!ifaces.is_empty()); + assert_eq!(ifaces[0].name, "User"); + } + + #[test] + fn test_extract_python_function() { + let source = r#" +def process_data(items: list[str]) -> dict: + """Process a list of items into a dictionary.""" + return {item: len(item) for item in items} +"#; + let symbols = extract_python_symbols(source); + assert!(!symbols.is_empty()); + assert_eq!(symbols[0].name, "process_data"); + assert_eq!(symbols[0].kind, "function"); + assert!(symbols[0].docstring.contains("Process 
a list")); + } + + #[test] + fn test_extract_python_class() { + let source = r#" +class DataProcessor: + """Processes data from various sources.""" + + def __init__(self, config): + self.config = config + + def run(self): + pass +"#; + let symbols = extract_python_symbols(source); + let classes: Vec<_> = symbols.iter().filter(|s| s.kind == "class").collect(); + assert!(!classes.is_empty()); + assert_eq!(classes[0].name, "DataProcessor"); + assert!(classes[0].docstring.contains("Processes data")); + } + + #[test] + fn test_extract_symbols_unknown_language() { + let symbols = extract_symbols("data.json", "{}"); + assert!(symbols.is_empty()); + } + + #[test] + fn test_extract_symbols_empty_file() { + let symbols = extract_symbols("empty.rs", ""); + assert!(symbols.is_empty()); + } + + #[test] + fn test_line_numbers_are_1_based() { + let source = "fn first() {}\nfn second() {}\nfn third() {}"; + let symbols = extract_rust_symbols(source); + assert!(symbols.len() >= 3); + assert_eq!(symbols[0].start_line, 1); + assert_eq!(symbols[1].start_line, 2); + assert_eq!(symbols[2].start_line, 3); + } +} diff --git a/src/grpc/service.rs b/src/grpc/service.rs index f10a40e..c79cc41 100644 --- a/src/grpc/service.rs +++ b/src/grpc/service.rs @@ -27,6 +27,107 @@ impl CodeAgentService { impl CodeAgent for CodeAgentService { type SessionStream = Pin> + Send>>; + async fn reindex_code( + &self, + request: Request, + ) -> Result, Status> { + let req = request.into_inner(); + info!(org = req.org.as_str(), repo = req.repo.as_str(), "Reindex code request"); + + let Some(ref os) = self.state.opensearch else { + return Ok(Response::new(ReindexCodeResponse { + repos_indexed: 0, + symbols_indexed: 0, + error: "OpenSearch not configured".into(), + })); + }; + + let Some(ref gitea_config) = self.state.config.services.gitea else { + return Ok(Response::new(ReindexCodeResponse { + repos_indexed: 0, + symbols_indexed: 0, + error: "Gitea not configured".into(), + })); + }; + + // Use the GiteaClient 
from the tool registry (already has auth configured) + let gitea = match self.state.tools.gitea_client() { + Some(g) => g, + None => { + return Ok(Response::new(ReindexCodeResponse { + repos_indexed: 0, + symbols_indexed: 0, + error: "Gitea client not available".into(), + })); + } + }; + let admin_user = "sol"; // Sol's own Gitea identity + + let index_name = self.state.code_index_name(); + // Ensure index exists + if let Err(e) = crate::code_index::schema::create_index_if_not_exists(os, &index_name).await { + return Ok(Response::new(ReindexCodeResponse { + repos_indexed: 0, + symbols_indexed: 0, + error: format!("Failed to create index: {e}"), + })); + } + + let mut indexer = crate::code_index::indexer::CodeIndexer::new( + os.clone(), index_name, String::new(), 50, + ); + + let org = if req.org.is_empty() { None } else { Some(req.org.as_str()) }; + + if !req.repo.is_empty() { + let parts: Vec<&str> = req.repo.splitn(2, '/').collect(); + let (owner, name) = if parts.len() == 2 { + (parts[0], parts[1]) + } else { + return Ok(Response::new(ReindexCodeResponse { + repos_indexed: 0, + symbols_indexed: 0, + error: "repo must be 'owner/name' format".into(), + })); + }; + let branch = if req.branch.is_empty() { "main" } else { &req.branch }; + + match crate::code_index::gitea::index_repo( + gitea, &mut indexer, admin_user, owner, name, branch + ).await { + Ok(count) => { + indexer.flush().await; + Ok(Response::new(ReindexCodeResponse { + repos_indexed: 1, + symbols_indexed: count, + error: String::new(), + })) + } + Err(e) => Ok(Response::new(ReindexCodeResponse { + repos_indexed: 0, + symbols_indexed: 0, + error: e.to_string(), + })), + } + } else { + // Index all repos + match crate::code_index::gitea::index_all_repos( + gitea, &mut indexer, admin_user, org + ).await { + Ok(count) => Ok(Response::new(ReindexCodeResponse { + repos_indexed: 0, // TODO: count repos + symbols_indexed: count, + error: String::new(), + })), + Err(e) => Ok(Response::new(ReindexCodeResponse { + 
repos_indexed: 0, + symbols_indexed: 0, + error: e.to_string(), + })), + } + } + } + async fn session( &self, request: Request>, diff --git a/src/tools/mod.rs b/src/tools/mod.rs index aa5bb21..0f7ddc9 100644 --- a/src/tools/mod.rs +++ b/src/tools/mod.rs @@ -73,6 +73,10 @@ impl ToolRegistry { } } + pub fn gitea_client(&self) -> Option<&Arc> { + self.gitea.as_ref() + } + pub fn has_gitea(&self) -> bool { self.gitea.is_some() }