gitdataai/libs/service/file_tools/grep.rs
ZhenYi 1af796ac75 feat(service): add file_tools module and git_blob_get tool
Add AI-accessible tools for reading structured files (CSV, JSON/JSONC,
Markdown, SQL) and searching repository content (git_grep). Also adds
git_blob_get to retrieve raw blob text content with binary detection.

Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library
API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
2026-04-18 23:02:10 +08:00

342 lines
11 KiB
Rust

//! git_grep — search repository files for patterns.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use regex::RegexBuilder;
use std::collections::HashMap;
/// Text file extensions to search (skip binary files).
/// Text file extensions to search (skip binary files).
///
/// Compared case-insensitively against the extension after the last `.`
/// in the path.
const TEXT_EXTS: &[&str] = &[
    "rs", "toml", "yaml", "yml", "json", "jsonc", "js", "jsx", "ts", "tsx",
    "css", "scss", "less", "html", "htm", "xml", "svg", "vue", "svelte",
    "py", "rb", "go", "java", "kt", "swift", "c", "cpp", "h", "hpp",
    "cs", "php", "pl", "sh", "bash", "zsh", "fish", "ps1", "bat", "cmd",
    "sql", "md", "markdown", "rst", "txt", "log", "ini", "cfg", "conf",
    "dockerfile", "makefile", "cmake", "gradle", "properties", "env",
    "proto", "graphql", "lock",
];

/// Returns `true` when `path` ends in one of the allowlisted text
/// extensions (case-insensitive). Paths without a `.` never match.
fn is_text_ext(path: &str) -> bool {
    let lower = path.to_lowercase();
    // Compare the final extension directly instead of allocating a
    // ".ext" probe string for every candidate extension on every call.
    match lower.rsplit_once('.') {
        Some((_, ext)) => TEXT_EXTS.contains(&ext),
        None => false,
    }
}
/// Heuristic binary sniff: content is considered binary when a NUL byte
/// appears within the first 8 KiB.
fn is_binary_content(data: &[u8]) -> bool {
    let probe_len = data.len().min(8192);
    data[..probe_len].contains(&0)
}
/// Execute the `git_grep` tool: walk every file reachable from a revision's
/// tree, run a (case-insensitive) pattern match line by line, and return
/// matches with optional surrounding context as a JSON payload.
///
/// Arguments read from `args`:
/// - `project_name`, `repo_name`, `pattern` — required strings.
/// - `rev` — optional revision, defaults to `"HEAD"`.
/// - `glob` — optional file filter; when present it REPLACES the
///   text-extension allowlist (binary sniffing below still applies).
/// - `is_regex` — default `true`; `false` searches the literal string.
/// - `context_lines` — default 0; `max_results` — default 100.
///
/// Errors are returned as plain strings (missing arguments, bad revision,
/// invalid regex, or underlying git failures).
async fn git_grep_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    // Tool arguments arrive as a free-form JSON object; convert to a map for
    // field-by-field extraction with per-field defaults.
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let pattern = p
        .get("pattern")
        .and_then(|v| v.as_str())
        .ok_or("missing pattern")?;
    let glob = p.get("glob").and_then(|v| v.as_str()).map(String::from);
    let is_regex = p
        .get("is_regex")
        .and_then(|v| v.as_bool())
        .unwrap_or(true);
    let context_lines = p
        .get("context_lines")
        .and_then(|v| v.as_u64())
        .unwrap_or(0) as usize;
    let max_results = p
        .get("max_results")
        .and_then(|v| v.as_u64())
        .unwrap_or(100) as usize;
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // Resolve revision to commit oid
    // A rev of 40+ chars is assumed to already be a full SHA-1 hex id;
    // anything shorter goes through prefix/ref resolution.
    // NOTE(review): commit_get_prefix presumably resolves short hashes and/or
    // ref names like "HEAD" or branch names — confirm against its definition.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    // Matching is always case-insensitive, in both regex and literal modes.
    let regex = if is_regex {
        RegexBuilder::new(pattern)
            .case_insensitive(true)
            .build()
            .map_err(|e| format!("invalid regex '{}': {}", pattern, e))?
    } else {
        // Escape for literal search
        RegexBuilder::new(&regex::escape(pattern))
            .case_insensitive(true)
            .build()
            .map_err(|e| e.to_string())?
    };
    // Recursive tree walk using git2
    let repo = domain.repo();
    let commit = repo
        .find_commit(commit_oid.to_oid().map_err(|e| e.to_string())?)
        .map_err(|e| e.to_string())?;
    let tree = commit.tree().map_err(|e| e.to_string())?;
    let mut results: Vec<serde_json::Value> = Vec::new();
    // Stack: (tree, current_path_prefix)
    // Iterative DFS over the tree; paths are built with '/' separators
    // relative to the repository root.
    let mut stack: Vec<(git2::Tree<'_>, String)> = vec![(tree, String::new())];
    while let Some((current_tree, current_prefix)) = stack.pop() {
        for entry in current_tree.iter() {
            let name = entry.name().unwrap_or_default();
            if name.is_empty() {
                // Entry name was not valid UTF-8 (or absent); skip it.
                continue;
            }
            let path: String = if current_prefix.is_empty() {
                name.to_string()
            } else {
                format!("{}/{}", current_prefix, name)
            };
            if entry.kind() == Some(git2::ObjectType::Tree) {
                // Descend into subdirectories; unreadable subtrees are skipped.
                if let Some(subtree) = entry.to_object(&repo).ok().and_then(|o| o.into_tree().ok()) {
                    stack.push((subtree, path));
                }
                continue;
            }
            if entry.kind() != Some(git2::ObjectType::Blob) {
                // Skip non-blob entries (e.g. submodule commits).
                continue;
            }
            // Glob filter
            // An explicit glob fully replaces the text-extension allowlist.
            if let Some(ref g) = glob {
                if !glob_match(&path, g) {
                    continue;
                }
            } else if !is_text_ext(&path) {
                continue;
            }
            // Read blob content
            let blob = match entry.to_object(&repo).ok().and_then(|o| o.into_blob().ok()) {
                Some(b) => b,
                None => continue,
            };
            let size = blob.size();
            // Skip empty files and files over MAX_FILE_SIZE (memory guard).
            if size == 0 || size > MAX_FILE_SIZE {
                continue;
            }
            let data = blob.content();
            if is_binary_content(data) {
                continue;
            }
            // Non-UTF-8 files are silently skipped.
            let content = match String::from_utf8(data.to_vec()) {
                Ok(s) => s,
                Err(_) => continue,
            };
            // Search line by line
            let lines: Vec<&str> = content.lines().collect();
            for (line_idx, line) in lines.iter().enumerate() {
                if regex.is_match(line) {
                    // Context window clamped to file bounds; the matching
                    // line itself is prefixed with '>' (others with a space),
                    // and line numbers are 1-based.
                    let start = line_idx.saturating_sub(context_lines);
                    let end = (line_idx + context_lines + 1).min(lines.len());
                    let context: Vec<String> = lines[start..end]
                        .iter()
                        .enumerate()
                        .map(|(i, l)| {
                            let line_num = start + i + 1;
                            let prefix = if start + i == line_idx { ">" } else { " " };
                            format!("{}{}: {}", prefix, line_num, l)
                        })
                        .collect();
                    results.push(serde_json::json!({
                        "file": path,
                        "line_number": line_idx + 1,
                        "match": line,
                        "context": context.join("\n"),
                    }));
                    if results.len() >= max_results {
                        // Early exit once the cap is reached; the payload is
                        // flagged as truncated.
                        return Ok(serde_json::json!({
                            "query": pattern,
                            "rev": rev,
                            "total_matches": results.len(),
                            "truncated": true,
                            "results": results
                        }));
                    }
                }
            }
        }
    }
    Ok(serde_json::json!({
        "query": pattern,
        "rev": rev,
        "total_matches": results.len(),
        "truncated": false,
        "results": results
    }))
}
/// Match a '/'-separated repository `path` against a simple glob `pattern`.
///
/// Supported syntax: `*` matches any run of characters within one segment,
/// `?` matches exactly one character, and a whole segment of `**` matches
/// zero or more path segments. Matching is case-insensitive. A pattern with
/// no '/' is matched against the file name only (`*.rs` matches `src/a.rs`);
/// a multi-segment pattern must consume the entire path (`src/*.rs` does NOT
/// match `src/a/b.rs`).
///
/// Fixes over the previous version: the path is lowercased too (previously
/// `*.rs` failed to match `Main.RS`), `?` is actually implemented as
/// advertised, and leftover pattern or path segments no longer cause false
/// positives (previously `src/**/*.ts` matched the bare path `src`).
fn glob_match(path: &str, pattern: &str) -> bool {
    // Normalize both sides so matching is case-insensitive.
    let path = path.to_lowercase();
    let pattern = pattern.to_lowercase();

    // Match one path segment against one pattern segment. An empty pattern
    // segment (e.g. from a trailing '/') matches anything, preserving the
    // previous lenient behavior.
    fn seg_match(text: &str, pat: &str) -> bool {
        if pat.is_empty() || pat == "*" {
            return true;
        }
        let t: Vec<char> = text.chars().collect();
        let p: Vec<char> = pat.chars().collect();
        // Classic backtracking wildcard match; patterns are short, so the
        // worst-case backtracking cost is negligible here.
        fn go(t: &[char], p: &[char]) -> bool {
            match p.first() {
                None => t.is_empty(),
                Some('*') => go(t, &p[1..]) || (!t.is_empty() && go(&t[1..], p)),
                Some('?') => !t.is_empty() && go(&t[1..], &p[1..]),
                Some(&c) => t.first() == Some(&c) && go(&t[1..], &p[1..]),
            }
        }
        go(&t, &p)
    }

    // Match pattern segments against path segments, requiring the entire
    // path to be consumed. `**` may swallow zero or more whole segments.
    fn walk(segs: &[&str], pats: &[&str]) -> bool {
        match pats.first() {
            None => segs.is_empty(),
            Some(&"**") => (0..=segs.len()).any(|skip| walk(&segs[skip..], &pats[1..])),
            Some(p) => !segs.is_empty() && seg_match(segs[0], p) && walk(&segs[1..], &pats[1..]),
        }
    }

    let pats: Vec<&str> = pattern.split('/').collect();
    if pats.len() == 1 {
        // A bare pattern (no '/') applies to the file name only.
        let file_name = path.rsplit('/').next().unwrap_or("");
        return seg_match(file_name, &pattern);
    }
    let segs: Vec<&str> = path.split('/').collect();
    walk(&segs, &pats)
}
pub fn register_grep_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam {
name: "project_name".into(),
param_type: "string".into(),
description: Some("Project name (slug)".into()),
required: true,
properties: None,
items: None,
}),
("repo_name".into(), ToolParam {
name: "repo_name".into(),
param_type: "string".into(),
description: Some("Repository name".into()),
required: true,
properties: None,
items: None,
}),
("pattern".into(), ToolParam {
name: "pattern".into(),
param_type: "string".into(),
description: Some("Search pattern (regex or literal string)".into()),
required: true,
properties: None,
items: None,
}),
("rev".into(), ToolParam {
name: "rev".into(),
param_type: "string".into(),
description: Some("Git revision to search in (branch, tag, commit). Default: HEAD".into()),
required: false,
properties: None,
items: None,
}),
("glob".into(), ToolParam {
name: "glob".into(),
param_type: "string".into(),
description: Some("File glob pattern to filter (e.g. *.rs, src/**/*.ts)".into()),
required: false,
properties: None,
items: None,
}),
("is_regex".into(), ToolParam {
name: "is_regex".into(),
param_type: "boolean".into(),
description: Some("If true, pattern is a regex. If false, literal string. Default: true".into()),
required: false,
properties: None,
items: None,
}),
("context_lines".into(), ToolParam {
name: "context_lines".into(),
param_type: "integer".into(),
description: Some("Number of surrounding lines to include for each match. Default: 0".into()),
required: false,
properties: None,
items: None,
}),
("max_results".into(), ToolParam {
name: "max_results".into(),
param_type: "integer".into(),
description: Some("Maximum number of matches to return. Default: 100".into()),
required: false,
properties: None,
items: None,
}),
]);
let schema = ToolSchema {
schema_type: "object".into(),
properties: Some(p),
required: Some(vec!["project_name".into(), "repo_name".into(), "pattern".into()]),
};
registry.register(
ToolDefinition::new("git_grep")
.description("Search for a text pattern across all files in a repository at a given revision. Supports regex, glob filtering, and line-level context. Skips binary files automatically.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
git_grep_exec(gctx, args)
.await
.map_err(agent::ToolError::ExecutionError)
})
}),
);
}