Add AI-accessible tools for reading structured files (CSV, JSON/JSONC, Markdown, SQL) and searching repository content (git_grep). Also adds git_blob_get to retrieve raw blob text content with binary detection. Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
342 lines
11 KiB
Rust
342 lines
11 KiB
Rust
//! git_grep — search repository files for patterns.
|
|
|
|
use crate::file_tools::MAX_FILE_SIZE;
|
|
use crate::git_tools::ctx::GitToolCtx;
|
|
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
|
use regex::RegexBuilder;
|
|
use std::collections::HashMap;
|
|
|
|
/// Text file extensions to search (skip binary files).
|
|
/// Text file extensions to search (skip binary files).
/// Kept duplicate-free; `is_text_ext` does an exact lookup against this list.
const TEXT_EXTS: &[&str] = &[
    "rs", "toml", "yaml", "yml", "json", "jsonc", "js", "jsx", "ts", "tsx",
    "css", "scss", "less", "html", "htm", "xml", "svg", "vue", "svelte",
    "py", "rb", "go", "java", "kt", "swift", "c", "cpp", "h", "hpp",
    "cs", "php", "pl", "sh", "bash", "zsh", "fish", "ps1", "bat", "cmd",
    "sql", "md", "markdown", "rst", "txt", "log", "ini", "cfg", "conf",
    "dockerfile", "makefile", "cmake", "gradle", "properties", "env",
    "proto", "graphql", "lock",
];

/// True when `path` ends with a known text extension (case-insensitive).
///
/// Extracts the substring after the last `.` and looks it up in
/// [`TEXT_EXTS`], avoiding the per-candidate `format!` allocation a naive
/// `ends_with(".{ext}")` scan would pay for every entry on every file.
/// A path with no `.` at all (e.g. `Makefile`) is treated as non-text,
/// matching the original `ends_with` behavior.
fn is_text_ext(path: &str) -> bool {
    let lower = path.to_lowercase();
    match lower.rsplit_once('.') {
        Some((_, ext)) => TEXT_EXTS.contains(&ext),
        None => false,
    }
}
|
|
|
|
/// Heuristic binary detection: content is considered binary when a NUL
/// byte appears anywhere in the first 8 KiB.
fn is_binary_content(data: &[u8]) -> bool {
    let probe_len = data.len().min(8192);
    data[..probe_len].contains(&0)
}
|
|
|
|
async fn git_grep_exec(
|
|
ctx: GitToolCtx,
|
|
args: serde_json::Value,
|
|
) -> Result<serde_json::Value, String> {
|
|
let p: serde_json::Map<String, serde_json::Value> =
|
|
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
|
|
|
let project_name = p
|
|
.get("project_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing project_name")?;
|
|
let repo_name = p
|
|
.get("repo_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing repo_name")?;
|
|
let rev = p
|
|
.get("rev")
|
|
.and_then(|v| v.as_str())
|
|
.map(String::from)
|
|
.unwrap_or_else(|| "HEAD".to_string());
|
|
let pattern = p
|
|
.get("pattern")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing pattern")?;
|
|
let glob = p.get("glob").and_then(|v| v.as_str()).map(String::from);
|
|
let is_regex = p
|
|
.get("is_regex")
|
|
.and_then(|v| v.as_bool())
|
|
.unwrap_or(true);
|
|
let context_lines = p
|
|
.get("context_lines")
|
|
.and_then(|v| v.as_u64())
|
|
.unwrap_or(0) as usize;
|
|
let max_results = p
|
|
.get("max_results")
|
|
.and_then(|v| v.as_u64())
|
|
.unwrap_or(100) as usize;
|
|
|
|
let domain = ctx.open_repo(project_name, repo_name).await?;
|
|
|
|
// Resolve revision to commit oid
|
|
let commit_oid = if rev.len() >= 40 {
|
|
git::commit::types::CommitOid::new(&rev)
|
|
} else {
|
|
domain
|
|
.commit_get_prefix(&rev)
|
|
.map_err(|e| e.to_string())?
|
|
.oid
|
|
};
|
|
|
|
let regex = if is_regex {
|
|
RegexBuilder::new(pattern)
|
|
.case_insensitive(true)
|
|
.build()
|
|
.map_err(|e| format!("invalid regex '{}': {}", pattern, e))?
|
|
} else {
|
|
// Escape for literal search
|
|
RegexBuilder::new(®ex::escape(pattern))
|
|
.case_insensitive(true)
|
|
.build()
|
|
.map_err(|e| e.to_string())?
|
|
};
|
|
|
|
// Recursive tree walk using git2
|
|
let repo = domain.repo();
|
|
let commit = repo
|
|
.find_commit(commit_oid.to_oid().map_err(|e| e.to_string())?)
|
|
.map_err(|e| e.to_string())?;
|
|
let tree = commit.tree().map_err(|e| e.to_string())?;
|
|
|
|
let mut results: Vec<serde_json::Value> = Vec::new();
|
|
// Stack: (tree, current_path_prefix)
|
|
let mut stack: Vec<(git2::Tree<'_>, String)> = vec![(tree, String::new())];
|
|
|
|
while let Some((current_tree, current_prefix)) = stack.pop() {
|
|
for entry in current_tree.iter() {
|
|
let name = entry.name().unwrap_or_default();
|
|
if name.is_empty() {
|
|
continue;
|
|
}
|
|
let path: String = if current_prefix.is_empty() {
|
|
name.to_string()
|
|
} else {
|
|
format!("{}/{}", current_prefix, name)
|
|
};
|
|
|
|
if entry.kind() == Some(git2::ObjectType::Tree) {
|
|
if let Some(subtree) = entry.to_object(&repo).ok().and_then(|o| o.into_tree().ok()) {
|
|
stack.push((subtree, path));
|
|
}
|
|
continue;
|
|
}
|
|
|
|
if entry.kind() != Some(git2::ObjectType::Blob) {
|
|
continue;
|
|
}
|
|
|
|
// Glob filter
|
|
if let Some(ref g) = glob {
|
|
if !glob_match(&path, g) {
|
|
continue;
|
|
}
|
|
} else if !is_text_ext(&path) {
|
|
continue;
|
|
}
|
|
|
|
// Read blob content
|
|
let blob = match entry.to_object(&repo).ok().and_then(|o| o.into_blob().ok()) {
|
|
Some(b) => b,
|
|
None => continue,
|
|
};
|
|
|
|
let size = blob.size();
|
|
if size == 0 || size > MAX_FILE_SIZE {
|
|
continue;
|
|
}
|
|
|
|
let data = blob.content();
|
|
if is_binary_content(data) {
|
|
continue;
|
|
}
|
|
|
|
let content = match String::from_utf8(data.to_vec()) {
|
|
Ok(s) => s,
|
|
Err(_) => continue,
|
|
};
|
|
|
|
// Search line by line
|
|
let lines: Vec<&str> = content.lines().collect();
|
|
for (line_idx, line) in lines.iter().enumerate() {
|
|
if regex.is_match(line) {
|
|
let start = line_idx.saturating_sub(context_lines);
|
|
let end = (line_idx + context_lines + 1).min(lines.len());
|
|
|
|
let context: Vec<String> = lines[start..end]
|
|
.iter()
|
|
.enumerate()
|
|
.map(|(i, l)| {
|
|
let line_num = start + i + 1;
|
|
let prefix = if start + i == line_idx { ">" } else { " " };
|
|
format!("{}{}: {}", prefix, line_num, l)
|
|
})
|
|
.collect();
|
|
|
|
results.push(serde_json::json!({
|
|
"file": path,
|
|
"line_number": line_idx + 1,
|
|
"match": line,
|
|
"context": context.join("\n"),
|
|
}));
|
|
|
|
if results.len() >= max_results {
|
|
return Ok(serde_json::json!({
|
|
"query": pattern,
|
|
"rev": rev,
|
|
"total_matches": results.len(),
|
|
"truncated": true,
|
|
"results": results
|
|
}));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(serde_json::json!({
|
|
"query": pattern,
|
|
"rev": rev,
|
|
"total_matches": results.len(),
|
|
"truncated": false,
|
|
"results": results
|
|
}))
|
|
}
|
|
|
|
/// Match `path` against a simple glob `pattern`, case-insensitively.
///
/// Supported syntax: `*` (any run of characters within one path segment),
/// `?` (any single character), and `**` (any number of whole path
/// segments, including zero). A pattern without `/` is matched against
/// the file name only, so `*.rs` matches `src/main.rs`.
///
/// Fixes over the previous implementation: a path that runs out before
/// the pattern does no longer matches (previously `"a"` matched
/// `"a/b/c"`), and multi-segment patterns are now case-insensitive like
/// single-segment ones.
fn glob_match(path: &str, pattern: &str) -> bool {
    // Match one path segment against one pattern segment. Empty segments
    // and bare `*`/`**` match anything, mirroring the lenient dialect the
    // tool has always accepted.
    fn segment_match(seg: &str, pat: &str) -> bool {
        if pat.is_empty() || pat == "*" || pat == "**" {
            return true;
        }
        // Iterative wildcard matching with backtracking to the last `*`.
        let s: Vec<char> = seg.chars().collect();
        let p: Vec<char> = pat.chars().collect();
        let (mut si, mut pi) = (0usize, 0usize);
        // (pattern index just past the last '*', segment index it started at)
        let mut backtrack: Option<(usize, usize)> = None;
        while si < s.len() {
            if pi < p.len() && (p[pi] == '?' || p[pi] == s[si]) {
                si += 1;
                pi += 1;
            } else if pi < p.len() && p[pi] == '*' {
                backtrack = Some((pi + 1, si));
                pi += 1;
            } else if let Some((bp, bs)) = backtrack {
                // Let the last '*' absorb one more character and retry.
                backtrack = Some((bp, bs + 1));
                pi = bp;
                si = bs + 1;
            } else {
                return false;
            }
        }
        // Trailing stars may match the empty remainder.
        while pi < p.len() && p[pi] == '*' {
            pi += 1;
        }
        pi == p.len()
    }

    // Match whole segment lists; `**` may consume any number of leading
    // path segments (including zero).
    fn match_parts(path: &[&str], pat: &[&str]) -> bool {
        match pat.split_first() {
            None => path.is_empty(),
            Some((&"**", rest)) => {
                (0..=path.len()).any(|skip| match_parts(&path[skip..], rest))
            }
            Some((first, rest)) => match path.split_first() {
                Some((head, tail)) => segment_match(head, first) && match_parts(tail, rest),
                None => false,
            },
        }
    }

    let pattern_lower = pattern.to_lowercase();
    let path_lower = path.to_lowercase();
    let pat_parts: Vec<&str> = pattern_lower.split('/').collect();
    let path_parts: Vec<&str> = path_lower.split('/').collect();

    if pat_parts.len() == 1 {
        // Simple glob pattern on filename only
        let file_name = path_parts.last().copied().unwrap_or("");
        return segment_match(file_name, pat_parts[0]);
    }

    // Multi-part glob: every pattern segment must be accounted for.
    match_parts(&path_parts, &pat_parts)
}
|
|
|
|
pub fn register_grep_tools(registry: &mut ToolRegistry) {
|
|
let p = HashMap::from([
|
|
("project_name".into(), ToolParam {
|
|
name: "project_name".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Project name (slug)".into()),
|
|
required: true,
|
|
properties: None,
|
|
items: None,
|
|
}),
|
|
("repo_name".into(), ToolParam {
|
|
name: "repo_name".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Repository name".into()),
|
|
required: true,
|
|
properties: None,
|
|
items: None,
|
|
}),
|
|
("pattern".into(), ToolParam {
|
|
name: "pattern".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Search pattern (regex or literal string)".into()),
|
|
required: true,
|
|
properties: None,
|
|
items: None,
|
|
}),
|
|
("rev".into(), ToolParam {
|
|
name: "rev".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Git revision to search in (branch, tag, commit). Default: HEAD".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
}),
|
|
("glob".into(), ToolParam {
|
|
name: "glob".into(),
|
|
param_type: "string".into(),
|
|
description: Some("File glob pattern to filter (e.g. *.rs, src/**/*.ts)".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
}),
|
|
("is_regex".into(), ToolParam {
|
|
name: "is_regex".into(),
|
|
param_type: "boolean".into(),
|
|
description: Some("If true, pattern is a regex. If false, literal string. Default: true".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
}),
|
|
("context_lines".into(), ToolParam {
|
|
name: "context_lines".into(),
|
|
param_type: "integer".into(),
|
|
description: Some("Number of surrounding lines to include for each match. Default: 0".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
}),
|
|
("max_results".into(), ToolParam {
|
|
name: "max_results".into(),
|
|
param_type: "integer".into(),
|
|
description: Some("Maximum number of matches to return. Default: 100".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
}),
|
|
]);
|
|
|
|
let schema = ToolSchema {
|
|
schema_type: "object".into(),
|
|
properties: Some(p),
|
|
required: Some(vec!["project_name".into(), "repo_name".into(), "pattern".into()]),
|
|
};
|
|
|
|
registry.register(
|
|
ToolDefinition::new("git_grep")
|
|
.description("Search for a text pattern across all files in a repository at a given revision. Supports regex, glob filtering, and line-level context. Skips binary files automatically.")
|
|
.parameters(schema),
|
|
ToolHandler::new(|ctx, args| {
|
|
let gctx = GitToolCtx::new(ctx);
|
|
Box::pin(async move {
|
|
git_grep_exec(gctx, args)
|
|
.await
|
|
.map_err(agent::ToolError::ExecutionError)
|
|
})
|
|
}),
|
|
);
|
|
}
|