feat: Gitea repo indexing via gRPC ReindexCode endpoint
Gitea indexer (code_index/gitea.rs): - Walks repos via GiteaClient API (list repos → traverse dirs → fetch files) - Base64 decodes file content from Gitea API responses - Extracts symbols with tree-sitter (Rust, TypeScript, Python) - Indexes to sol_code OpenSearch index with repo/branch/source metadata - Skips hidden dirs, vendor, node_modules, files >100KB - delete_branch() for clean re-indexing Server-side tree-sitter (code_index/symbols.rs): - Full symbol extraction shared with CLI client - extract_symbols(), extract_project_symbols(), detect_language() gRPC ReindexCode RPC: - ReindexCodeRequest: org, repo, branch (all optional filters) - ReindexCodeResponse: repos_indexed, symbols_indexed, error - Uses ToolRegistry's GiteaClient (already authenticated) - Creates sol_code index if not exists ToolRegistry.gitea_client() accessor for reindex endpoint.
This commit is contained in:
70
Cargo.lock
generated
70
Cargo.lock
generated
@@ -1357,7 +1357,7 @@ dependencies = [
|
||||
"libc",
|
||||
"option-ext",
|
||||
"redox_users",
|
||||
"windows-sys 0.59.0",
|
||||
"windows-sys 0.61.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -1371,6 +1371,12 @@ dependencies = [
|
||||
"syn",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "dotenv"
|
||||
version = "0.15.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "77c90badedccf4105eca100756a0b1289e191f6fcbdadd3cee1d2f614f97da8f"
|
||||
|
||||
[[package]]
|
||||
name = "dprint-swc-ext"
|
||||
version = "0.26.0"
|
||||
@@ -3771,7 +3777,7 @@ dependencies = [
|
||||
"once_cell",
|
||||
"socket2",
|
||||
"tracing",
|
||||
"windows-sys 0.59.0",
|
||||
"windows-sys 0.60.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@@ -4705,6 +4711,7 @@ dependencies = [
|
||||
"deno_ast",
|
||||
"deno_core",
|
||||
"deno_error",
|
||||
"dotenv",
|
||||
"futures",
|
||||
"jsonwebtoken",
|
||||
"libsqlite3-sys",
|
||||
@@ -4730,6 +4737,10 @@ dependencies = [
|
||||
"tonic-prost-build",
|
||||
"tracing",
|
||||
"tracing-subscriber",
|
||||
"tree-sitter",
|
||||
"tree-sitter-python",
|
||||
"tree-sitter-rust",
|
||||
"tree-sitter-typescript",
|
||||
"url",
|
||||
"uuid",
|
||||
]
|
||||
@@ -4808,6 +4819,12 @@ dependencies = [
|
||||
"unicode-ident",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "streaming-iterator"
|
||||
version = "0.1.9"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520"
|
||||
|
||||
[[package]]
|
||||
name = "string_enum"
|
||||
version = "1.0.2"
|
||||
@@ -5871,6 +5888,55 @@ dependencies = [
|
||||
"tracing-log",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter"
|
||||
version = "0.24.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a5387dffa7ffc7d2dae12b50c6f7aab8ff79d6210147c6613561fc3d474c6f75"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"regex",
|
||||
"regex-syntax",
|
||||
"streaming-iterator",
|
||||
"tree-sitter-language",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-language"
|
||||
version = "0.1.7"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782"
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-python"
|
||||
version = "0.23.6"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3d065aaa27f3aaceaf60c1f0e0ac09e1cb9eb8ed28e7bcdaa52129cffc7f4b04"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"tree-sitter-language",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-rust"
|
||||
version = "0.23.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ca8ccb3e3a3495c8a943f6c3fd24c3804c471fd7f4f16087623c7fa4c0068e8a"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"tree-sitter-language",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-typescript"
|
||||
version = "0.23.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6c5f76ed8d947a75cc446d5fccd8b602ebf0cde64ccf2ffa434d873d7a575eff"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"tree-sitter-language",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "triomphe"
|
||||
version = "0.1.15"
|
||||
|
||||
@@ -44,6 +44,10 @@ prost = "0.14"
|
||||
tokio-stream = "0.1"
|
||||
jsonwebtoken = "9"
|
||||
tokenizers = { version = "0.22", default-features = false, features = ["onig", "http"] }
|
||||
tree-sitter = "0.24"
|
||||
tree-sitter-rust = "0.23"
|
||||
tree-sitter-typescript = "0.23"
|
||||
tree-sitter-python = "0.23"
|
||||
|
||||
[dev-dependencies]
|
||||
dotenv = "0.15"
|
||||
|
||||
230
src/code_index/gitea.rs
Normal file
230
src/code_index/gitea.rs
Normal file
@@ -0,0 +1,230 @@
|
||||
//! Gitea repo indexer — walks repos via the Gitea API, extracts symbols
|
||||
//! with tree-sitter, and indexes them to OpenSearch.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use tracing::{debug, error, info, warn};
|
||||
|
||||
use super::indexer::CodeIndexer;
|
||||
use super::schema::SymbolDocument;
|
||||
use super::symbols;
|
||||
use crate::sdk::gitea::GiteaClient;
|
||||
|
||||
/// Index all repos for an organization (or all accessible repos).
|
||||
pub async fn index_all_repos(
|
||||
gitea: &GiteaClient,
|
||||
indexer: &mut CodeIndexer,
|
||||
admin_localpart: &str,
|
||||
org: Option<&str>,
|
||||
) -> anyhow::Result<u32> {
|
||||
let repos = gitea
|
||||
.list_repos(admin_localpart, None, org, Some(100))
|
||||
.await
|
||||
.map_err(|e| anyhow::anyhow!("Failed to list repos: {e}"))?;
|
||||
|
||||
let mut total_symbols = 0u32;
|
||||
|
||||
for repo in &repos {
|
||||
// full_name is "owner/name"
|
||||
let parts: Vec<&str> = repo.full_name.splitn(2, '/').collect();
|
||||
if parts.len() != 2 {
|
||||
warn!(full_name = repo.full_name.as_str(), "Invalid repo full_name");
|
||||
continue;
|
||||
}
|
||||
let owner = parts[0];
|
||||
let name = parts[1];
|
||||
|
||||
// Get full repo details for default_branch
|
||||
let full_repo = match gitea.get_repo(admin_localpart, owner, name).await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
warn!(owner, name, "Failed to get repo details: {e}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let default_branch = &full_repo.default_branch;
|
||||
|
||||
info!(owner, name, branch = default_branch.as_str(), "Indexing repo");
|
||||
|
||||
match index_repo(gitea, indexer, admin_localpart, owner, name, &default_branch).await {
|
||||
Ok(count) => {
|
||||
total_symbols += count;
|
||||
info!(owner, name, count, "Indexed repo symbols");
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(owner, name, "Failed to index repo: {e}");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
indexer.flush().await;
|
||||
Ok(total_symbols)
|
||||
}
|
||||
|
||||
/// Index a single repo at a given branch.
|
||||
pub async fn index_repo(
|
||||
gitea: &GiteaClient,
|
||||
indexer: &mut CodeIndexer,
|
||||
localpart: &str,
|
||||
owner: &str,
|
||||
repo: &str,
|
||||
branch: &str,
|
||||
) -> anyhow::Result<u32> {
|
||||
// Delete existing symbols for this repo+branch before re-indexing
|
||||
indexer.delete_branch(repo, branch).await;
|
||||
|
||||
let mut count = 0u32;
|
||||
let mut dirs_to_visit = vec![String::new()]; // start at repo root
|
||||
|
||||
while let Some(dir_path) = dirs_to_visit.pop() {
|
||||
let entries = match gitea
|
||||
.get_file(localpart, owner, repo, &dir_path, Some(branch))
|
||||
.await
|
||||
{
|
||||
Ok(content) => content,
|
||||
Err(e) => {
|
||||
debug!(owner, repo, path = dir_path.as_str(), "Failed to list directory: {e}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
// get_file returns a JSON string — parse as array of entries
|
||||
let entries_json: serde_json::Value =
|
||||
serde_json::from_str(&serde_json::to_string(&entries).unwrap_or_default())
|
||||
.unwrap_or_default();
|
||||
|
||||
// If it's a single file response (not a directory listing), skip
|
||||
if !entries_json.is_array() {
|
||||
continue;
|
||||
}
|
||||
|
||||
let items = entries_json.as_array().unwrap();
|
||||
for item in items {
|
||||
let name = item["name"].as_str().unwrap_or("");
|
||||
let path = item["path"].as_str().unwrap_or("");
|
||||
let file_type = item["type"].as_str().unwrap_or("");
|
||||
|
||||
// Skip hidden, vendor, build dirs
|
||||
if name.starts_with('.')
|
||||
|| name == "target"
|
||||
|| name == "vendor"
|
||||
|| name == "node_modules"
|
||||
|| name == "dist"
|
||||
|| name == "__pycache__"
|
||||
|| name == ".git"
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if file_type == "dir" {
|
||||
dirs_to_visit.push(path.to_string());
|
||||
} else if file_type == "file" {
|
||||
// Check if it's a supported source file
|
||||
let lang = symbols::detect_language(path);
|
||||
if lang.is_none() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Skip large files
|
||||
let size = item["size"].as_u64().unwrap_or(0);
|
||||
if size > 100_000 {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Fetch file content
|
||||
let content = match fetch_file_content(gitea, localpart, owner, repo, path, branch).await {
|
||||
Some(c) => c,
|
||||
None => continue,
|
||||
};
|
||||
|
||||
// Extract symbols
|
||||
let syms = symbols::extract_symbols(path, &content);
|
||||
let now = chrono::Utc::now().timestamp_millis();
|
||||
|
||||
for sym in syms {
|
||||
// Build content snippet for embedding
|
||||
let body = extract_body(&content, sym.start_line, sym.end_line);
|
||||
|
||||
indexer
|
||||
.add(SymbolDocument {
|
||||
file_path: path.to_string(),
|
||||
repo_owner: Some(owner.to_string()),
|
||||
repo_name: repo.to_string(),
|
||||
language: sym.language,
|
||||
symbol_name: sym.name,
|
||||
symbol_kind: sym.kind,
|
||||
signature: sym.signature,
|
||||
docstring: sym.docstring,
|
||||
start_line: sym.start_line,
|
||||
end_line: sym.end_line,
|
||||
content: body,
|
||||
branch: branch.to_string(),
|
||||
source: "gitea".into(),
|
||||
indexed_at: now,
|
||||
})
|
||||
.await;
|
||||
count += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(count)
|
||||
}
|
||||
|
||||
/// Fetch and decode a file's content from Gitea (base64-encoded API response).
|
||||
async fn fetch_file_content(
|
||||
gitea: &GiteaClient,
|
||||
localpart: &str,
|
||||
owner: &str,
|
||||
repo: &str,
|
||||
path: &str,
|
||||
branch: &str,
|
||||
) -> Option<String> {
|
||||
let response = gitea
|
||||
.get_file(localpart, owner, repo, path, Some(branch))
|
||||
.await
|
||||
.ok()?;
|
||||
|
||||
// The response is a JSON string — parse it
|
||||
let json_str = serde_json::to_string(&response).ok()?;
|
||||
let json: serde_json::Value = serde_json::from_str(&json_str).ok()?;
|
||||
|
||||
// Content is base64-encoded
|
||||
let encoded = json["content"].as_str()?;
|
||||
let cleaned = encoded.replace('\n', ""); // Gitea adds newlines in base64
|
||||
let decoded = base64::Engine::decode(&base64::engine::general_purpose::STANDARD, &cleaned).ok()?;
|
||||
String::from_utf8(decoded).ok()
|
||||
}
|
||||
|
||||
/// Extract the body of a symbol from source content.
///
/// `start_line` and `end_line` are 1-based and inclusive. Out-of-range
/// line numbers are clamped (the original indexed `lines[start..end]`
/// unchecked and could panic when `start_line` exceeded the file length).
/// Bodies longer than 500 bytes are truncated with an ellipsis; the cut
/// is backed off to a UTF-8 character boundary so slicing cannot panic
/// mid-codepoint (the original sliced at a fixed byte 497).
fn extract_body(content: &str, start_line: u32, end_line: u32) -> String {
    let lines: Vec<&str> = content.lines().collect();
    let start = (start_line as usize).saturating_sub(1).min(lines.len());
    let end = (end_line as usize).min(lines.len()).max(start);
    let body = lines[start..end].join("\n");
    if body.len() > 500 {
        // Back off from byte 497 to the nearest char boundary.
        let mut cut = 497;
        while !body.is_char_boundary(cut) {
            cut -= 1;
        }
        format!("{}…", &body[..cut])
    } else {
        body
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    // Mid-range extraction: lines 2..=4 inclusive, joined with '\n'.
    #[test]
    fn test_extract_body() {
        let content = "line 1\nline 2\nline 3\nline 4\nline 5";
        assert_eq!(extract_body(content, 2, 4), "line 2\nline 3\nline 4");
    }

    // Bodies over 500 bytes are truncated to 497 bytes plus an ellipsis.
    #[test]
    fn test_extract_body_truncation() {
        let long_content: String = (0..100).map(|i| format!("line {i} with some content to make it longer")).collect::<Vec<_>>().join("\n");
        let body = extract_body(&long_content, 1, 100);
        // 497 bytes + the 3-byte '…' = 500, within the 501 bound.
        assert!(body.len() <= 501);
        assert!(body.ends_with('…'));
    }
}
|
||||
@@ -3,5 +3,7 @@
|
||||
//! Indexes symbols (functions, structs, enums, traits) with their signatures,
|
||||
//! docstrings, and body content. Supports branch-aware semantic search.
|
||||
|
||||
pub mod schema;
|
||||
pub mod gitea;
|
||||
pub mod indexer;
|
||||
pub mod schema;
|
||||
pub mod symbols;
|
||||
|
||||
659
src/code_index/symbols.rs
Normal file
659
src/code_index/symbols.rs
Normal file
@@ -0,0 +1,659 @@
|
||||
//! Symbol extraction from source code using tree-sitter.
|
||||
//!
|
||||
//! Extracts function signatures, struct/enum/trait definitions, and
|
||||
//! docstrings from Rust, TypeScript, and Python files. These symbols
|
||||
//! are sent to Sol for indexing in the code search index.
|
||||
|
||||
use std::path::Path;
|
||||
use tracing::debug;
|
||||
|
||||
/// An extracted code symbol with file context.
///
/// Produced by `extract_project_symbols`; unlike `CodeSymbol` it also
/// carries the file path and a content snippet for indexing.
#[derive(Debug, Clone)]
pub struct ProjectSymbol {
    pub file_path: String, // relative to project root
    pub name: String, // symbol identifier (function/class/struct name)
    pub kind: String, // e.g. "function", "struct", "class", "interface"
    pub signature: String, // signature line(s) of the definition
    pub docstring: String, // extracted doc comment / docstring ("" if none)
    pub start_line: u32, // 1-based first line of the definition
    pub end_line: u32, // 1-based last line of the definition
    pub language: String, // detected language, e.g. "rust", "python"
    pub content: String, // body snippet, truncated to ~500 bytes
}
|
||||
|
||||
/// Extract symbols from all source files in a project.
|
||||
pub fn extract_project_symbols(project_root: &str) -> Vec<ProjectSymbol> {
|
||||
let root = Path::new(project_root);
|
||||
let mut symbols = Vec::new();
|
||||
|
||||
walk_directory(root, root, &mut symbols);
|
||||
debug!(count = symbols.len(), "Extracted project symbols");
|
||||
symbols
|
||||
}
|
||||
|
||||
fn walk_directory(dir: &Path, root: &Path, symbols: &mut Vec<ProjectSymbol>) {
|
||||
let Ok(entries) = std::fs::read_dir(dir) else { return };
|
||||
|
||||
for entry in entries.flatten() {
|
||||
let path = entry.path();
|
||||
let name = entry.file_name().to_string_lossy().to_string();
|
||||
|
||||
// Skip hidden, vendor, target, node_modules, etc.
|
||||
if name.starts_with('.') || name == "target" || name == "vendor"
|
||||
|| name == "node_modules" || name == "dist" || name == "build"
|
||||
|| name == "__pycache__" || name == ".git"
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
||||
if path.is_dir() {
|
||||
walk_directory(&path, root, symbols);
|
||||
} else if path.is_file() {
|
||||
let path_str = path.to_string_lossy().to_string();
|
||||
if detect_language(&path_str).is_some() {
|
||||
// Read file (skip large files)
|
||||
if let Ok(content) = std::fs::read_to_string(&path) {
|
||||
if content.len() > 100_000 { continue; } // skip >100KB
|
||||
|
||||
let rel_path = path.strip_prefix(root)
|
||||
.map(|p| p.to_string_lossy().to_string())
|
||||
.unwrap_or(path_str.clone());
|
||||
|
||||
for sym in extract_symbols(&path_str, &content) {
|
||||
// Build content: signature + body up to 500 chars
|
||||
let body_start = content.lines()
|
||||
.take(sym.start_line as usize - 1)
|
||||
.map(|l| l.len() + 1)
|
||||
.sum::<usize>();
|
||||
let body_end = content.lines()
|
||||
.take(sym.end_line as usize)
|
||||
.map(|l| l.len() + 1)
|
||||
.sum::<usize>()
|
||||
.min(content.len());
|
||||
let body = &content[body_start..body_end];
|
||||
let truncated = if body.len() > 500 {
|
||||
format!("{}…", &body[..497])
|
||||
} else {
|
||||
body.to_string()
|
||||
};
|
||||
|
||||
symbols.push(ProjectSymbol {
|
||||
file_path: rel_path.clone(),
|
||||
name: sym.name,
|
||||
kind: sym.kind,
|
||||
signature: sym.signature,
|
||||
docstring: sym.docstring,
|
||||
start_line: sym.start_line,
|
||||
end_line: sym.end_line,
|
||||
language: sym.language,
|
||||
content: truncated,
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// An extracted code symbol.
///
/// Language-agnostic result of tree-sitter extraction; file context is
/// added by the callers that know the path.
#[derive(Debug, Clone)]
pub struct CodeSymbol {
    pub name: String, // symbol identifier
    pub kind: String, // "function", "struct", "enum", "trait", "class", "interface", "method"
    pub signature: String, // full signature line
    pub docstring: String, // doc comment / docstring ("" if none)
    pub start_line: u32, // 1-based
    pub end_line: u32, // 1-based
    pub language: String, // "rust", "typescript", or "python"
}
|
||||
|
||||
/// Map a file's extension to a supported language identifier.
///
/// Returns `None` for unsupported or missing extensions.
pub fn detect_language(path: &str) -> Option<&'static str> {
    let language = match Path::new(path).extension()?.to_str()? {
        "rs" => "rust",
        "ts" | "tsx" => "typescript",
        "js" | "jsx" => "javascript",
        "py" => "python",
        _ => return None,
    };
    Some(language)
}
|
||||
|
||||
/// Extract symbols from a source file's content.
|
||||
pub fn extract_symbols(path: &str, content: &str) -> Vec<CodeSymbol> {
|
||||
let Some(lang) = detect_language(path) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
match lang {
|
||||
"rust" => extract_rust_symbols(content),
|
||||
"typescript" | "javascript" => extract_ts_symbols(content),
|
||||
"python" => extract_python_symbols(content),
|
||||
_ => Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
// ── Rust ────────────────────────────────────────────────────────────────
|
||||
|
||||
fn extract_rust_symbols(content: &str) -> Vec<CodeSymbol> {
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser.set_language(&tree_sitter_rust::LANGUAGE.into()).ok();
|
||||
|
||||
let Some(tree) = parser.parse(content, None) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let mut symbols = Vec::new();
|
||||
let root = tree.root_node();
|
||||
let bytes = content.as_bytes();
|
||||
|
||||
walk_rust_node(root, bytes, content, &mut symbols);
|
||||
symbols
|
||||
}
|
||||
|
||||
fn walk_rust_node(
|
||||
node: tree_sitter::Node,
|
||||
bytes: &[u8],
|
||||
source: &str,
|
||||
symbols: &mut Vec<CodeSymbol>,
|
||||
) {
|
||||
match node.kind() {
|
||||
"function_item" | "function_signature_item" => {
|
||||
if let Some(sym) = extract_rust_function(node, bytes, source) {
|
||||
symbols.push(sym);
|
||||
}
|
||||
}
|
||||
"struct_item" => {
|
||||
if let Some(sym) = extract_rust_type(node, bytes, source, "struct") {
|
||||
symbols.push(sym);
|
||||
}
|
||||
}
|
||||
"enum_item" => {
|
||||
if let Some(sym) = extract_rust_type(node, bytes, source, "enum") {
|
||||
symbols.push(sym);
|
||||
}
|
||||
}
|
||||
"trait_item" => {
|
||||
if let Some(sym) = extract_rust_type(node, bytes, source, "trait") {
|
||||
symbols.push(sym);
|
||||
}
|
||||
}
|
||||
"impl_item" => {
|
||||
// Walk impl methods
|
||||
for i in 0..node.child_count() {
|
||||
if let Some(child) = node.child(i) {
|
||||
if child.kind() == "declaration_list" {
|
||||
walk_rust_node(child, bytes, source, symbols);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
for i in 0..node.child_count() {
|
||||
if let Some(child) = node.child(i) {
|
||||
walk_rust_node(child, bytes, source, symbols);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_rust_function(node: tree_sitter::Node, bytes: &[u8], source: &str) -> Option<CodeSymbol> {
|
||||
let name = node.child_by_field_name("name")?;
|
||||
let name_str = name.utf8_text(bytes).ok()?.to_string();
|
||||
|
||||
// Build signature: everything from start to the opening brace (or end if no body)
|
||||
let start_byte = node.start_byte();
|
||||
let sig_end = find_rust_sig_end(node, source);
|
||||
let signature = source[start_byte..sig_end].trim().to_string();
|
||||
|
||||
// Extract doc comment (line comments starting with /// before the function)
|
||||
let docstring = extract_rust_doc_comment(node, source);
|
||||
|
||||
Some(CodeSymbol {
|
||||
name: name_str,
|
||||
kind: "function".into(),
|
||||
signature,
|
||||
docstring,
|
||||
start_line: node.start_position().row as u32 + 1,
|
||||
end_line: node.end_position().row as u32 + 1,
|
||||
language: "rust".into(),
|
||||
})
|
||||
}
|
||||
|
||||
fn extract_rust_type(node: tree_sitter::Node, bytes: &[u8], source: &str, kind: &str) -> Option<CodeSymbol> {
|
||||
let name = node.child_by_field_name("name")?;
|
||||
let name_str = name.utf8_text(bytes).ok()?.to_string();
|
||||
|
||||
// Signature: first line of the definition
|
||||
let start = node.start_byte();
|
||||
let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte());
|
||||
let signature = source[start..first_line_end].trim().to_string();
|
||||
|
||||
let docstring = extract_rust_doc_comment(node, source);
|
||||
|
||||
Some(CodeSymbol {
|
||||
name: name_str,
|
||||
kind: kind.into(),
|
||||
signature,
|
||||
docstring,
|
||||
start_line: node.start_position().row as u32 + 1,
|
||||
end_line: node.end_position().row as u32 + 1,
|
||||
language: "rust".into(),
|
||||
})
|
||||
}
|
||||
|
||||
fn find_rust_sig_end(node: tree_sitter::Node, source: &str) -> usize {
|
||||
// Find the opening brace
|
||||
for i in 0..node.child_count() {
|
||||
if let Some(child) = node.child(i) {
|
||||
if child.kind() == "block" || child.kind() == "field_declaration_list"
|
||||
|| child.kind() == "enum_variant_list" || child.kind() == "declaration_list"
|
||||
{
|
||||
return child.start_byte();
|
||||
}
|
||||
}
|
||||
}
|
||||
// No body (e.g., trait method signature)
|
||||
node.end_byte().min(source.len())
|
||||
}
|
||||
|
||||
/// Collect the `///` doc comment immediately preceding `node`.
///
/// Scans source lines backwards from the line above the item, gathering
/// `///` lines and skipping over attribute lines (`#[...]`). Returns the
/// doc text joined with newlines in top-to-bottom order, or "" if none.
fn extract_rust_doc_comment(node: tree_sitter::Node, source: &str) -> String {
    let start_line = node.start_position().row;
    if start_line == 0 {
        // Item is on the very first line: nothing above it to scan.
        return String::new();
    }

    let lines: Vec<&str> = source.lines().collect();
    let mut doc_lines = Vec::new();

    // Walk backwards from the line before the node
    let mut line_idx = start_line.saturating_sub(1);
    loop {
        if line_idx >= lines.len() {
            break;
        }
        let line = lines[line_idx].trim();
        if line.starts_with("///") {
            doc_lines.push(line.trim_start_matches("///").trim());
        } else if line.starts_with("#[") || line.is_empty() {
            // Skip attributes and blank lines between doc and function
            // NOTE(review): a blank line only terminates the scan once doc
            // lines were already collected; a blank line directly above
            // the item lets the scan continue upward and may attach a
            // non-adjacent doc block — confirm this is intended.
            if line.is_empty() && !doc_lines.is_empty() {
                break; // blank line after doc block = stop
            }
        } else {
            break;
        }
        if line_idx == 0 {
            break;
        }
        line_idx -= 1;
    }

    // Lines were collected bottom-up; restore document order.
    doc_lines.reverse();
    doc_lines.join("\n")
}
|
||||
|
||||
// ── TypeScript / JavaScript ─────────────────────────────────────────────
|
||||
|
||||
fn extract_ts_symbols(content: &str) -> Vec<CodeSymbol> {
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser.set_language(&tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into()).ok();
|
||||
|
||||
let Some(tree) = parser.parse(content, None) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let mut symbols = Vec::new();
|
||||
walk_ts_node(tree.root_node(), content.as_bytes(), content, &mut symbols);
|
||||
symbols
|
||||
}
|
||||
|
||||
/// Recursively visit a TypeScript/JavaScript syntax node, collecting
/// function, class, interface, type-alias, and enum declarations.
/// Symbols without a resolvable name are skipped.
fn walk_ts_node(
    node: tree_sitter::Node,
    bytes: &[u8],
    source: &str,
    symbols: &mut Vec<CodeSymbol>,
) {
    match node.kind() {
        // NOTE(review): "arrow_function" nodes generally have no `name`
        // field (the name lives on the enclosing variable declarator), so
        // this arm likely never fires for arrows — confirm intent.
        "function_declaration" | "method_definition" | "arrow_function" => {
            if let Some(name) = node.child_by_field_name("name") {
                let name_str = name.utf8_text(bytes).unwrap_or("").to_string();
                if !name_str.is_empty() {
                    // Signature: first line of the declaration.
                    let start = node.start_byte();
                    let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte());
                    symbols.push(CodeSymbol {
                        name: name_str,
                        kind: "function".into(),
                        signature: source[start..first_line_end].trim().to_string(),
                        docstring: String::new(), // TODO: JSDoc extraction
                        start_line: node.start_position().row as u32 + 1,
                        end_line: node.end_position().row as u32 + 1,
                        language: "typescript".into(),
                    });
                }
            }
        }
        "class_declaration" | "interface_declaration" | "type_alias_declaration" | "enum_declaration" => {
            if let Some(name) = node.child_by_field_name("name") {
                let name_str = name.utf8_text(bytes).unwrap_or("").to_string();
                // Map the node kind to the symbol kind label.
                let kind = match node.kind() {
                    "class_declaration" => "class",
                    "interface_declaration" => "interface",
                    "enum_declaration" => "enum",
                    _ => "type",
                };
                let start = node.start_byte();
                let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte());
                symbols.push(CodeSymbol {
                    name: name_str,
                    kind: kind.into(),
                    signature: source[start..first_line_end].trim().to_string(),
                    docstring: String::new(),
                    start_line: node.start_position().row as u32 + 1,
                    end_line: node.end_position().row as u32 + 1,
                    language: "typescript".into(),
                });
            }
        }
        _ => {}
    }

    // Unconditional recursion: nested declarations (class methods,
    // namespaced types) are still visited after the match above.
    for i in 0..node.child_count() {
        if let Some(child) = node.child(i) {
            walk_ts_node(child, bytes, source, symbols);
        }
    }
}
|
||||
|
||||
// ── Python ──────────────────────────────────────────────────────────────
|
||||
|
||||
fn extract_python_symbols(content: &str) -> Vec<CodeSymbol> {
|
||||
let mut parser = tree_sitter::Parser::new();
|
||||
parser.set_language(&tree_sitter_python::LANGUAGE.into()).ok();
|
||||
|
||||
let Some(tree) = parser.parse(content, None) else {
|
||||
return Vec::new();
|
||||
};
|
||||
|
||||
let mut symbols = Vec::new();
|
||||
walk_python_node(tree.root_node(), content.as_bytes(), content, &mut symbols);
|
||||
symbols
|
||||
}
|
||||
|
||||
fn walk_python_node(
|
||||
node: tree_sitter::Node,
|
||||
bytes: &[u8],
|
||||
source: &str,
|
||||
symbols: &mut Vec<CodeSymbol>,
|
||||
) {
|
||||
match node.kind() {
|
||||
"function_definition" => {
|
||||
if let Some(name) = node.child_by_field_name("name") {
|
||||
let name_str = name.utf8_text(bytes).unwrap_or("").to_string();
|
||||
let start = node.start_byte();
|
||||
let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte());
|
||||
let docstring = extract_python_docstring(node, bytes);
|
||||
symbols.push(CodeSymbol {
|
||||
name: name_str,
|
||||
kind: "function".into(),
|
||||
signature: source[start..first_line_end].trim().to_string(),
|
||||
docstring,
|
||||
start_line: node.start_position().row as u32 + 1,
|
||||
end_line: node.end_position().row as u32 + 1,
|
||||
language: "python".into(),
|
||||
});
|
||||
}
|
||||
}
|
||||
"class_definition" => {
|
||||
if let Some(name) = node.child_by_field_name("name") {
|
||||
let name_str = name.utf8_text(bytes).unwrap_or("").to_string();
|
||||
let start = node.start_byte();
|
||||
let first_line_end = source[start..].find('\n').map(|i| start + i).unwrap_or(node.end_byte());
|
||||
let docstring = extract_python_docstring(node, bytes);
|
||||
symbols.push(CodeSymbol {
|
||||
name: name_str,
|
||||
kind: "class".into(),
|
||||
signature: source[start..first_line_end].trim().to_string(),
|
||||
docstring,
|
||||
start_line: node.start_position().row as u32 + 1,
|
||||
end_line: node.end_position().row as u32 + 1,
|
||||
language: "python".into(),
|
||||
});
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
|
||||
for i in 0..node.child_count() {
|
||||
if let Some(child) = node.child(i) {
|
||||
walk_python_node(child, bytes, source, symbols);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn extract_python_docstring(node: tree_sitter::Node, bytes: &[u8]) -> String {
|
||||
// Python docstrings are the first expression_statement in the body
|
||||
if let Some(body) = node.child_by_field_name("body") {
|
||||
if let Some(first_stmt) = body.child(0) {
|
||||
if first_stmt.kind() == "expression_statement" {
|
||||
if let Some(expr) = first_stmt.child(0) {
|
||||
if expr.kind() == "string" {
|
||||
let text = expr.utf8_text(bytes).unwrap_or("");
|
||||
// Strip triple quotes
|
||||
let trimmed = text
|
||||
.trim_start_matches("\"\"\"")
|
||||
.trim_start_matches("'''")
|
||||
.trim_end_matches("\"\"\"")
|
||||
.trim_end_matches("'''")
|
||||
.trim();
|
||||
return trimmed.to_string();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
String::new()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_detect_language() {
|
||||
assert_eq!(detect_language("src/main.rs"), Some("rust"));
|
||||
assert_eq!(detect_language("app.ts"), Some("typescript"));
|
||||
assert_eq!(detect_language("app.tsx"), Some("typescript"));
|
||||
assert_eq!(detect_language("script.py"), Some("python"));
|
||||
assert_eq!(detect_language("script.js"), Some("javascript"));
|
||||
assert_eq!(detect_language("data.json"), None);
|
||||
assert_eq!(detect_language("README.md"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_rust_function() {
|
||||
let source = r#"
|
||||
/// Generate a response.
|
||||
pub async fn generate(&self, req: &GenerateRequest) -> Option<String> {
|
||||
self.run_and_emit(req).await
|
||||
}
|
||||
"#;
|
||||
let symbols = extract_rust_symbols(source);
|
||||
assert!(!symbols.is_empty(), "Should extract at least one symbol");
|
||||
|
||||
let func = &symbols[0];
|
||||
assert_eq!(func.name, "generate");
|
||||
assert_eq!(func.kind, "function");
|
||||
assert!(func.signature.contains("pub async fn generate"));
|
||||
assert!(func.docstring.contains("Generate a response"));
|
||||
assert_eq!(func.language, "rust");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_rust_struct() {
|
||||
let source = r#"
|
||||
/// A request to generate.
|
||||
pub struct GenerateRequest {
|
||||
pub text: String,
|
||||
pub user_id: String,
|
||||
}
|
||||
"#;
|
||||
let symbols = extract_rust_symbols(source);
|
||||
let structs: Vec<_> = symbols.iter().filter(|s| s.kind == "struct").collect();
|
||||
assert!(!structs.is_empty());
|
||||
assert_eq!(structs[0].name, "GenerateRequest");
|
||||
assert!(structs[0].docstring.contains("request to generate"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_rust_enum() {
|
||||
let source = r#"
|
||||
/// Whether server or client.
|
||||
pub enum ToolSide {
|
||||
Server,
|
||||
Client,
|
||||
}
|
||||
"#;
|
||||
let symbols = extract_rust_symbols(source);
|
||||
let enums: Vec<_> = symbols.iter().filter(|s| s.kind == "enum").collect();
|
||||
assert!(!enums.is_empty());
|
||||
assert_eq!(enums[0].name, "ToolSide");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_rust_trait() {
|
||||
let source = r#"
|
||||
pub trait Executor {
|
||||
fn execute(&self, args: &str) -> String;
|
||||
}
|
||||
"#;
|
||||
let symbols = extract_rust_symbols(source);
|
||||
let traits: Vec<_> = symbols.iter().filter(|s| s.kind == "trait").collect();
|
||||
assert!(!traits.is_empty());
|
||||
assert_eq!(traits[0].name, "Executor");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_rust_impl_methods() {
|
||||
let source = r#"
|
||||
impl Orchestrator {
|
||||
/// Create new.
|
||||
pub fn new(config: Config) -> Self {
|
||||
Self { config }
|
||||
}
|
||||
|
||||
/// Subscribe to events.
|
||||
pub fn subscribe(&self) -> Receiver {
|
||||
self.tx.subscribe()
|
||||
}
|
||||
}
|
||||
"#;
|
||||
let symbols = extract_rust_symbols(source);
|
||||
let fns: Vec<_> = symbols.iter().filter(|s| s.kind == "function").collect();
|
||||
assert!(fns.len() >= 2, "Should find impl methods, got {}", fns.len());
|
||||
let names: Vec<&str> = fns.iter().map(|s| s.name.as_str()).collect();
|
||||
assert!(names.contains(&"new"));
|
||||
assert!(names.contains(&"subscribe"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_extract_ts_function() {
|
||||
let source = r#"
|
||||
function greet(name: string): string {
|
||||
return `Hello, ${name}`;
|
||||
}
|
||||
"#;
|
||||
let symbols = extract_ts_symbols(source);
|
||||
assert!(!symbols.is_empty());
|
||||
assert_eq!(symbols[0].name, "greet");
|
||||
assert_eq!(symbols[0].kind, "function");
|
||||
}
|
||||
|
||||
#[test]
fn test_extract_ts_class() {
    // A TypeScript class declaration should yield a "class" symbol.
    let src = r#"
class UserService {
    constructor(private db: Database) {}

    async getUser(id: string): Promise<User> {
        return this.db.find(id);
    }
}
"#;
    let extracted = extract_ts_symbols(src);
    let class_syms = extracted
        .iter()
        .filter(|sym| sym.kind == "class")
        .collect::<Vec<_>>();
    assert!(!class_syms.is_empty());
    assert_eq!(class_syms[0].name, "UserService");
}
|
||||
|
||||
#[test]
fn test_extract_ts_interface() {
    // TypeScript interfaces (including optional members) map to kind "interface".
    let src = r#"
interface User {
    id: string;
    name: string;
    email?: string;
}
"#;
    let extracted = extract_ts_symbols(src);
    let iface_syms = extracted
        .iter()
        .filter(|sym| sym.kind == "interface")
        .collect::<Vec<_>>();
    assert!(!iface_syms.is_empty());
    assert_eq!(iface_syms[0].name, "User");
}
|
||||
|
||||
#[test]
fn test_extract_python_function() {
    // A Python def with a docstring should produce a "function" symbol
    // carrying that docstring.
    let src = r#"
def process_data(items: list[str]) -> dict:
    """Process a list of items into a dictionary."""
    return {item: len(item) for item in items}
"#;
    let extracted = extract_python_symbols(src);
    assert!(!extracted.is_empty());
    assert_eq!(extracted[0].name, "process_data");
    assert_eq!(extracted[0].kind, "function");
    assert!(extracted[0].docstring.contains("Process a list"));
}
|
||||
|
||||
#[test]
fn test_extract_python_class() {
    // A Python class with a docstring should yield a "class" symbol whose
    // docstring is captured.
    let src = r#"
class DataProcessor:
    """Processes data from various sources."""

    def __init__(self, config):
        self.config = config

    def run(self):
        pass
"#;
    let extracted = extract_python_symbols(src);
    let class_syms = extracted
        .iter()
        .filter(|sym| sym.kind == "class")
        .collect::<Vec<_>>();
    assert!(!class_syms.is_empty());
    assert_eq!(class_syms[0].name, "DataProcessor");
    assert!(class_syms[0].docstring.contains("Processes data"));
}
|
||||
|
||||
#[test]
fn test_extract_symbols_unknown_language() {
    // Files whose extension maps to no supported language produce no symbols.
    let extracted = extract_symbols("data.json", "{}");
    assert!(extracted.is_empty());
}
|
||||
|
||||
#[test]
fn test_extract_symbols_empty_file() {
    // An empty source file yields an empty symbol list even for a known
    // extension.
    let extracted = extract_symbols("empty.rs", "");
    assert!(extracted.is_empty());
}
|
||||
|
||||
#[test]
fn test_line_numbers_are_1_based() {
    // Three one-line functions on source lines 1..=3: start_line must be
    // 1-based, not 0-based.
    let src = "fn first() {}\nfn second() {}\nfn third() {}";
    let extracted = extract_rust_symbols(src);
    assert!(extracted.len() >= 3);
    for (idx, want) in [(0, 1), (1, 2), (2, 3)] {
        assert_eq!(extracted[idx].start_line, want);
    }
}
|
||||
}
|
||||
@@ -27,6 +27,107 @@ impl CodeAgentService {
|
||||
impl CodeAgent for CodeAgentService {
|
||||
type SessionStream = Pin<Box<dyn Stream<Item = Result<ServerMessage, Status>> + Send>>;
|
||||
|
||||
/// gRPC handler: walk Gitea repositories and (re)index their symbols into
/// the OpenSearch code index.
///
/// Request fields (all optional): `org` filters the owner when indexing
/// everything; `repo` ("owner/name") targets a single repository; `branch`
/// defaults to "main" for the single-repo path.
///
/// All failures (missing config, bad request, indexing errors) are reported
/// in-band via `ReindexCodeResponse.error` with a zeroed count, never as a
/// gRPC `Status` error.
async fn reindex_code(
    &self,
    request: Request<ReindexCodeRequest>,
) -> Result<Response<ReindexCodeResponse>, Status> {
    let req = request.into_inner();
    info!(org = req.org.as_str(), repo = req.repo.as_str(), "Reindex code request");

    // Precondition: an OpenSearch connection must be configured.
    let Some(ref os) = self.state.opensearch else {
        return Ok(Response::new(ReindexCodeResponse {
            repos_indexed: 0,
            symbols_indexed: 0,
            error: "OpenSearch not configured".into(),
        }));
    };

    // Precondition: Gitea must be configured. NOTE(review): `gitea_config`
    // is only used here as a presence check and appears otherwise unused —
    // confirm, and consider `.is_none()` or an underscore binding.
    let Some(ref gitea_config) = self.state.config.services.gitea else {
        return Ok(Response::new(ReindexCodeResponse {
            repos_indexed: 0,
            symbols_indexed: 0,
            error: "Gitea not configured".into(),
        }));
    };

    // Use the GiteaClient from the tool registry (already has auth configured)
    let gitea = match self.state.tools.gitea_client() {
        Some(g) => g,
        None => {
            return Ok(Response::new(ReindexCodeResponse {
                repos_indexed: 0,
                symbols_indexed: 0,
                error: "Gitea client not available".into(),
            }));
        }
    };
    let admin_user = "sol"; // Sol's own Gitea identity

    let index_name = self.state.code_index_name();
    // Ensure index exists
    if let Err(e) = crate::code_index::schema::create_index_if_not_exists(os, &index_name).await {
        return Ok(Response::new(ReindexCodeResponse {
            repos_indexed: 0,
            symbols_indexed: 0,
            error: format!("Failed to create index: {e}"),
        }));
    }

    // Batching indexer over the target index; the literal 50 is the batch
    // size passed to CodeIndexer::new. NOTE(review): meaning of the empty
    // String argument is not visible here — confirm against CodeIndexer.
    let mut indexer = crate::code_index::indexer::CodeIndexer::new(
        os.clone(), index_name, String::new(), 50,
    );

    // Empty org string means "no owner filter" for the index-all path.
    let org = if req.org.is_empty() { None } else { Some(req.org.as_str()) };

    if !req.repo.is_empty() {
        // Single-repo path: `repo` must be "owner/name".
        let parts: Vec<&str> = req.repo.splitn(2, '/').collect();
        let (owner, name) = if parts.len() == 2 {
            (parts[0], parts[1])
        } else {
            return Ok(Response::new(ReindexCodeResponse {
                repos_indexed: 0,
                symbols_indexed: 0,
                error: "repo must be 'owner/name' format".into(),
            }));
        };
        let branch = if req.branch.is_empty() { "main" } else { &req.branch };

        match crate::code_index::gitea::index_repo(
            gitea, &mut indexer, admin_user, owner, name, branch
        ).await {
            Ok(count) => {
                // Push any still-buffered documents before reporting success.
                indexer.flush().await;
                Ok(Response::new(ReindexCodeResponse {
                    repos_indexed: 1,
                    symbols_indexed: count,
                    error: String::new(),
                }))
            }
            Err(e) => Ok(Response::new(ReindexCodeResponse {
                repos_indexed: 0,
                symbols_indexed: 0,
                error: e.to_string(),
            })),
        }
    } else {
        // Index all repos
        // NOTE(review): unlike the single-repo success path, this branch
        // never calls indexer.flush() — confirm index_all_repos flushes
        // internally, otherwise trailing batched symbols may be dropped.
        match crate::code_index::gitea::index_all_repos(
            gitea, &mut indexer, admin_user, org
        ).await {
            Ok(count) => Ok(Response::new(ReindexCodeResponse {
                repos_indexed: 0, // TODO: count repos
                symbols_indexed: count,
                error: String::new(),
            })),
            Err(e) => Ok(Response::new(ReindexCodeResponse {
                repos_indexed: 0,
                symbols_indexed: 0,
                error: e.to_string(),
            })),
        }
    }
}
|
||||
|
||||
async fn session(
|
||||
&self,
|
||||
request: Request<Streaming<ClientMessage>>,
|
||||
|
||||
@@ -73,6 +73,10 @@ impl ToolRegistry {
|
||||
}
|
||||
}
|
||||
|
||||
/// Borrow the shared, already-authenticated Gitea API client, if one was
/// configured for this registry. Returns `None` when Gitea is not set up.
pub fn gitea_client(&self) -> Option<&Arc<crate::sdk::gitea::GiteaClient>> {
    self.gitea.as_ref()
}
|
||||
|
||||
pub fn has_gitea(&self) -> bool {
|
||||
self.gitea.is_some()
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user