feat(service): add file_tools module and git_blob_get tool

Add AI-accessible tools for reading structured files (CSV, JSON/JSONC,
Markdown, SQL) and searching repository content (git_grep). Also adds
git_blob_get to retrieve raw blob text content with binary detection.

Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library
API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
This commit is contained in:
ZhenYi 2026-04-18 23:02:10 +08:00
parent 767bb10249
commit 1af796ac75
15 changed files with 2553 additions and 4 deletions

248
Cargo.lock generated
View File

@ -614,11 +614,23 @@ dependencies = [
"num-traits", "num-traits",
] ]
[[package]]
name = "ar_archive_writer"
version = "0.5.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b"
dependencies = [
"object",
]
[[package]] [[package]]
name = "arbitrary" name = "arbitrary"
version = "1.4.2" version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
dependencies = [
"derive_arbitrary",
]
[[package]] [[package]]
name = "arc-swap" name = "arc-swap"
@ -1357,6 +1369,21 @@ dependencies = [
"libbz2-rs-sys", "libbz2-rs-sys",
] ]
[[package]]
name = "calamine"
version = "0.26.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
dependencies = [
"byteorder",
"codepage",
"encoding_rs",
"log",
"quick-xml 0.31.0",
"serde",
"zip 2.4.2",
]
[[package]] [[package]]
name = "captcha-rs" name = "captcha-rs"
version = "0.5.0" version = "0.5.0"
@ -1509,6 +1536,15 @@ version = "0.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "de0758edba32d61d1fd9f4d69491b47604b91ee2f7e6b33de7e54ca4ebe55dc3" checksum = "de0758edba32d61d1fd9f4d69491b47604b91ee2f7e6b33de7e54ca4ebe55dc3"
[[package]]
name = "codepage"
version = "0.1.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
dependencies = [
"encoding_rs",
]
[[package]] [[package]]
name = "color_quant" name = "color_quant"
version = "1.1.0" version = "1.1.0"
@ -1799,6 +1835,27 @@ dependencies = [
"hybrid-array", "hybrid-array",
] ]
[[package]]
name = "csv"
version = "1.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938"
dependencies = [
"csv-core",
"itoa",
"ryu",
"serde_core",
]
[[package]]
name = "csv-core"
version = "0.1.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "ctr" name = "ctr"
version = "0.9.2" version = "0.9.2"
@ -1989,6 +2046,17 @@ dependencies = [
"serde_core", "serde_core",
] ]
[[package]]
name = "derive_arbitrary"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.117",
]
[[package]] [[package]]
name = "derive_builder" name = "derive_builder"
version = "0.20.2" version = "0.20.2"
@ -2676,6 +2744,15 @@ dependencies = [
"zeroize", "zeroize",
] ]
[[package]]
name = "getopts"
version = "0.2.24"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
dependencies = [
"unicode-width",
]
[[package]] [[package]]
name = "getrandom" name = "getrandom"
version = "0.2.17" version = "0.2.17"
@ -2786,7 +2863,7 @@ dependencies = [
"tokio", "tokio",
"tokio-util", "tokio-util",
"uuid", "uuid",
"zip", "zip 8.4.0",
] ]
[[package]] [[package]]
@ -4203,6 +4280,26 @@ dependencies = [
"imgref", "imgref",
] ]
[[package]]
name = "lopdf"
version = "0.34.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff"
dependencies = [
"chrono",
"encoding_rs",
"flate2",
"indexmap 2.13.0",
"itoa",
"log",
"md-5",
"nom 7.1.3",
"rangemap",
"rayon",
"time",
"weezl",
]
[[package]] [[package]]
name = "lru" name = "lru"
version = "0.12.5" version = "0.12.5"
@ -4698,6 +4795,15 @@ dependencies = [
"objc2-core-foundation", "objc2-core-foundation",
] ]
[[package]]
name = "object"
version = "0.37.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "once_cell" name = "once_cell"
version = "1.21.4" version = "1.21.4"
@ -5424,6 +5530,16 @@ dependencies = [
"prost", "prost",
] ]
[[package]]
name = "psm"
version = "0.1.30"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8"
dependencies = [
"ar_archive_writer",
"cc",
]
[[package]] [[package]]
name = "ptr_meta" name = "ptr_meta"
version = "0.1.4" version = "0.1.4"
@ -5444,6 +5560,25 @@ dependencies = [
"syn 1.0.109", "syn 1.0.109",
] ]
[[package]]
name = "pulldown-cmark"
version = "0.12.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f86ba2052aebccc42cbbb3ed234b8b13ce76f75c3551a303cb2bcffcff12bb14"
dependencies = [
"bitflags",
"getopts",
"memchr",
"pulldown-cmark-escape",
"unicase",
]
[[package]]
name = "pulldown-cmark-escape"
version = "0.11.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae"
[[package]] [[package]]
name = "pxfm" name = "pxfm"
version = "0.1.28" version = "0.1.28"
@ -5505,6 +5640,25 @@ version = "2.0.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
[[package]]
name = "quick-xml"
version = "0.31.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
dependencies = [
"encoding_rs",
"memchr",
]
[[package]]
name = "quick-xml"
version = "0.37.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
dependencies = [
"memchr",
]
[[package]] [[package]]
name = "quinn" name = "quinn"
version = "0.11.9" version = "0.11.9"
@ -5679,6 +5833,12 @@ dependencies = [
"rand 0.9.2", "rand 0.9.2",
] ]
[[package]]
name = "rangemap"
version = "1.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68"
[[package]] [[package]]
name = "rav1e" name = "rav1e"
version = "0.8.1" version = "0.8.1"
@ -5755,6 +5915,26 @@ dependencies = [
"crossbeam-utils", "crossbeam-utils",
] ]
[[package]]
name = "recursive"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e"
dependencies = [
"recursive-proc-macro-impl",
"stacker",
]
[[package]]
name = "recursive-proc-macro-impl"
version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
dependencies = [
"quote",
"syn 2.0.117",
]
[[package]] [[package]]
name = "redis" name = "redis"
version = "1.1.0" version = "1.1.0"
@ -6745,22 +6925,29 @@ dependencies = [
"async-openai", "async-openai",
"avatar", "avatar",
"base64 0.22.1", "base64 0.22.1",
"calamine",
"captcha-rs", "captcha-rs",
"chrono", "chrono",
"config", "config",
"csv",
"db", "db",
"deadpool-redis", "deadpool-redis",
"email", "email",
"flate2",
"futures", "futures",
"git", "git",
"git2", "git2",
"hex", "hex",
"hmac", "hmac",
"lopdf",
"models", "models",
"moka", "moka",
"pulldown-cmark",
"queue", "queue",
"quick-xml 0.37.5",
"rand 0.10.0", "rand 0.10.0",
"redis", "redis",
"regex",
"reqwest 0.13.2", "reqwest 0.13.2",
"room", "room",
"rsa", "rsa",
@ -6772,11 +6959,15 @@ dependencies = [
"sha1", "sha1",
"sha2 0.11.0", "sha2 0.11.0",
"slog", "slog",
"sqlparser",
"tempfile",
"tokio", "tokio",
"tokio-stream", "tokio-stream",
"tracing", "tracing",
"utoipa", "utoipa",
"uuid", "uuid",
"walkdir",
"zip 8.4.0",
] ]
[[package]] [[package]]
@ -6992,6 +7183,16 @@ dependencies = [
"der", "der",
] ]
[[package]]
name = "sqlparser"
version = "0.55.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11"
dependencies = [
"log",
"recursive",
]
[[package]] [[package]]
name = "sqlx" name = "sqlx"
version = "0.8.6" version = "0.8.6"
@ -7275,6 +7476,19 @@ version = "1.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
[[package]]
name = "stacker"
version = "0.1.23"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013"
dependencies = [
"cc",
"cfg-if",
"libc",
"psm",
"windows-sys 0.59.0",
]
[[package]] [[package]]
name = "static-server" name = "static-server"
version = "0.2.9" version = "0.2.9"
@ -7936,6 +8150,12 @@ version = "1.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c"
[[package]]
name = "unicode-width"
version = "0.2.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
[[package]] [[package]]
name = "unicode-xid" name = "unicode-xid"
version = "0.2.6" version = "0.2.6"
@ -8504,6 +8724,15 @@ dependencies = [
"windows-targets 0.52.6", "windows-targets 0.52.6",
] ]
[[package]]
name = "windows-sys"
version = "0.59.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
dependencies = [
"windows-targets 0.52.6",
]
[[package]] [[package]]
name = "windows-sys" name = "windows-sys"
version = "0.60.2" version = "0.60.2"
@ -8974,6 +9203,23 @@ dependencies = [
"syn 2.0.117", "syn 2.0.117",
] ]
[[package]]
name = "zip"
version = "2.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
dependencies = [
"arbitrary",
"crc32fast",
"crossbeam-utils",
"displaydoc",
"flate2",
"indexmap 2.13.0",
"memchr",
"thiserror 2.0.18",
"zopfli",
]
[[package]] [[package]]
name = "zip" name = "zip"
version = "8.4.0" version = "8.4.0"

View File

@ -142,6 +142,12 @@ hostname = "0.4"
utoipa = { version = "5.4.0", features = ["chrono", "uuid"] } utoipa = { version = "5.4.0", features = ["chrono", "uuid"] }
rust_decimal = "1.40.0" rust_decimal = "1.40.0"
walkdir = "2.5.0" walkdir = "2.5.0"
calamine = "0.26"
csv = "1.3"
lopdf = "0.34"
pulldown-cmark = "0.12"
quick-xml = "0.37"
sqlparser = "0.55"
lazy_static = "1.5" lazy_static = "1.5"
moka = "0.12.15" moka = "0.12.15"
serde = "1.0.228" serde = "1.0.228"
@ -151,9 +157,7 @@ serde_bytes = "0.11.19"
phf = "0.13.1" phf = "0.13.1"
phf_codegen = "0.13.1" phf_codegen = "0.13.1"
base64 = "0.22.1" base64 = "0.22.1"
tempfile = "3"
[workspace.package] [workspace.package]
version = "0.2.9" version = "0.2.9"

View File

@ -54,6 +54,17 @@ futures = { workspace = true }
deadpool-redis = { workspace = true, features = ["rt_tokio_1", "cluster-async", "cluster"] } deadpool-redis = { workspace = true, features = ["rt_tokio_1", "cluster-async", "cluster"] }
moka = { workspace = true, features = ["future"] } moka = { workspace = true, features = ["future"] }
rust_decimal = { workspace = true } rust_decimal = { workspace = true }
calamine = { workspace = true }
csv = { workspace = true }
quick-xml = { workspace = true }
lopdf = { workspace = true }
pulldown-cmark = { workspace = true }
sqlparser = { workspace = true }
walkdir = { workspace = true }
zip = { workspace = true }
regex = { workspace = true }
flate2 = { workspace = true }
tempfile = { workspace = true }
[lints] [lints]
workspace = true workspace = true

View File

@ -0,0 +1,325 @@
//! read_csv — parse and query CSV files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use csv::ReaderBuilder;
use std::collections::HashMap;
/// Execute the `read_csv` tool: load a CSV blob from a git revision and
/// return its rows as JSON.
///
/// Supports header detection (`has_header`), column selection (`columns`,
/// requires headers), equality filtering (`filter_column`/`filter_value`),
/// a custom single-character ASCII delimiter, and `offset`/`limit`
/// pagination. Errors are returned as human-readable strings on the tool
/// protocol's error channel.
async fn read_csv_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let delimiter = p
        .get("delimiter")
        .and_then(|v| v.as_str())
        .and_then(|s| s.chars().next())
        .unwrap_or(',');
    // The csv crate only accepts a single-byte delimiter; a non-ASCII char
    // would silently truncate under `as u8`, so reject it explicitly.
    if !delimiter.is_ascii() {
        return Err(format!(
            "delimiter '{}' must be a single ASCII character",
            delimiter
        ));
    }
    let has_header = p
        .get("has_header")
        .and_then(|v| v.as_bool())
        .unwrap_or(true);
    let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
    let limit = p.get("limit").and_then(|v| v.as_u64()).unwrap_or(100) as usize;
    let filter_col = p.get("filter_column").and_then(|v| v.as_str());
    let filter_val = p.get("filter_value").and_then(|v| v.as_str());
    let select_cols = p.get("columns").and_then(|v| v.as_array()).map(|a| {
        a.iter()
            .filter_map(|v| v.as_str().map(String::from))
            .collect::<Vec<_>>()
    });
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // A 40+ char rev is taken as a full commit oid; anything shorter is
    // resolved as a prefix/ref.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
    if blob.is_binary {
        return Err("file is binary, not a CSV".to_string());
    }
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }
    let text = String::from_utf8_lossy(data);
    let mut reader = ReaderBuilder::new()
        .delimiter(delimiter as u8)
        .has_headers(has_header)
        .from_reader(text.as_bytes());
    let headers: Vec<String> = if has_header {
        reader
            .headers()
            .map_err(|e| e.to_string())?
            .clone()
            .into_iter()
            .map(String::from)
            .collect()
    } else {
        vec![]
    };
    // Resolve the requested column names to indices. Column selection
    // requires headers; without a header row there are no names to
    // resolve, so every field of each record is returned instead.
    let col_indices: Vec<usize> = if let Some(ref sel) = select_cols {
        sel.iter()
            .filter_map(|col| headers.iter().position(|h| h == col))
            .collect()
    } else {
        (0..headers.len()).collect()
    };
    let filter_col_idx = filter_col.and_then(|c| headers.iter().position(|h| h == c));
    let mut rows: Vec<serde_json::Value> = Vec::new();
    let mut skipped = 0;
    let mut total = 0;
    for result in reader.records() {
        let record = result.map_err(|e| e.to_string())?;
        // Skip the first `offset` data rows.
        if skipped < offset {
            skipped += 1;
            continue;
        }
        total += 1;
        // Equality filter (only active when both column and value given
        // and the column exists in the headers).
        if let (Some(fci), Some(fv)) = (filter_col_idx, filter_val) {
            if record.get(fci) != Some(fv) {
                continue;
            }
        }
        let obj = if has_header {
            // With headers: emit an object keyed by the selected columns.
            let mut map = serde_json::Map::new();
            for &idx in &col_indices {
                let key = headers
                    .get(idx)
                    .cloned()
                    .unwrap_or_else(|| format!("col_{}", idx));
                let val = record.get(idx).unwrap_or("").to_string();
                map.insert(key, serde_json::Value::String(val));
            }
            serde_json::Value::Object(map)
        } else {
            // Without headers `col_indices` is empty (nothing to resolve
            // names against), so emit every field as a positional array.
            // Previously this branch produced empty arrays for every row.
            let arr: Vec<String> = record.iter().map(String::from).collect();
            serde_json::Value::Array(arr.into_iter().map(serde_json::Value::String).collect())
        };
        rows.push(obj);
        if rows.len() >= limit {
            break;
        }
    }
    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "headers": if has_header { headers } else { vec![] },
        "selected_columns": select_cols,
        "rows": rows,
        "row_count": rows.len(),
        // Rows scanned plus the skipped offset; NOT the whole-file total
        // when the scan stopped early at `limit`.
        "total_available": total + offset,
        "filter": if let (Some(c), Some(v)) = (filter_col, filter_val) {
            serde_json::json!({ "column": c, "value": v })
        } else { serde_json::Value::Null },
    }))
}
pub fn register_csv_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
(
"project_name".into(),
ToolParam {
name: "project_name".into(),
param_type: "string".into(),
description: Some("Project name (slug)".into()),
required: true,
properties: None,
items: None,
},
),
(
"repo_name".into(),
ToolParam {
name: "repo_name".into(),
param_type: "string".into(),
description: Some("Repository name".into()),
required: true,
properties: None,
items: None,
},
),
(
"path".into(),
ToolParam {
name: "path".into(),
param_type: "string".into(),
description: Some("File path within the repository".into()),
required: true,
properties: None,
items: None,
},
),
(
"rev".into(),
ToolParam {
name: "rev".into(),
param_type: "string".into(),
description: Some("Git revision (default: HEAD)".into()),
required: false,
properties: None,
items: None,
},
),
(
"delimiter".into(),
ToolParam {
name: "delimiter".into(),
param_type: "string".into(),
description: Some("Field delimiter character (default: comma \",\")".into()),
required: false,
properties: None,
items: None,
},
),
(
"has_header".into(),
ToolParam {
name: "has_header".into(),
param_type: "boolean".into(),
description: Some("If true, first row is column headers (default: true)".into()),
required: false,
properties: None,
items: None,
},
),
(
"columns".into(),
ToolParam {
name: "columns".into(),
param_type: "array".into(),
description: Some("List of column names to select".into()),
required: false,
properties: None,
items: Some(Box::new(ToolParam {
name: "".into(),
param_type: "string".into(),
description: None,
required: false,
properties: None,
items: None,
})),
},
),
(
"filter_column".into(),
ToolParam {
name: "filter_column".into(),
param_type: "string".into(),
description: Some("Column name to filter by".into()),
required: false,
properties: None,
items: None,
},
),
(
"filter_value".into(),
ToolParam {
name: "filter_value".into(),
param_type: "string".into(),
description: Some("Value to match in filter_column".into()),
required: false,
properties: None,
items: None,
},
),
(
"offset".into(),
ToolParam {
name: "offset".into(),
param_type: "integer".into(),
description: Some("Number of rows to skip (default: 0)".into()),
required: false,
properties: None,
items: None,
},
),
(
"limit".into(),
ToolParam {
name: "limit".into(),
param_type: "integer".into(),
description: Some("Maximum rows to return (default: 100)".into()),
required: false,
properties: None,
items: None,
},
),
]);
let schema = ToolSchema {
schema_type: "object".into(),
properties: Some(p),
required: Some(vec![
"project_name".into(),
"repo_name".into(),
"path".into(),
]),
};
registry.register(
ToolDefinition::new("read_csv")
.description("Parse and query a CSV file. Supports header detection, column selection, filtering, pagination (offset/limit), and custom delimiters.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_csv_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}

View File

@ -0,0 +1,184 @@
//! read_excel — parse and query Excel files (.xlsx, .xls).
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use calamine::{open_workbook, Reader, Xlsx};
use futures::FutureExt;
use std::collections::HashMap;
async fn read_excel_exec(
ctx: GitToolCtx,
args: serde_json::Value,
) -> Result<serde_json::Value, String> {
let p: serde_json::Map<String, serde_json::Value> =
serde_json::from_value(args).map_err(|e| e.to_string())?;
let project_name = p
.get("project_name")
.and_then(|v| v.as_str())
.ok_or("missing project_name")?;
let repo_name = p
.get("repo_name")
.and_then(|v| v.as_str())
.ok_or("missing repo_name")?;
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
let rev = p
.get("rev")
.and_then(|v| v.as_str())
.map(String::from)
.unwrap_or_else(|| "HEAD".to_string());
let sheet_name = p.get("sheet_name").and_then(|v| v.as_str()).map(String::from);
let sheet_index = p.get("sheet_index").and_then(|v| v.as_u64()).map(|v| v as usize);
let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
let limit = p
.get("limit")
.and_then(|v| v.as_u64())
.unwrap_or(100) as usize;
let has_header = p
.get("has_header")
.and_then(|v| v.as_bool())
.unwrap_or(true);
let domain = ctx.open_repo(project_name, repo_name).await?;
let commit_oid = if rev.len() >= 40 {
git::commit::types::CommitOid::new(&rev)
} else {
domain
.commit_get_prefix(&rev)
.map_err(|e| e.to_string())?
.oid
};
let entry = domain
.tree_entry_by_path_from_commit(&commit_oid, path)
.map_err(|e| e.to_string())?;
let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
let data = &content.content;
if data.len() > MAX_FILE_SIZE {
return Err(format!(
"file too large ({} bytes), max {} bytes",
data.len(),
MAX_FILE_SIZE
));
}
// Use cursor-based reading to avoid tempfile
let cursor = std::io::Cursor::new(data.clone());
let mut workbook: Xlsx<std::io::Cursor<Vec<u8>>> =
open_workbook(cursor).map_err(|e| format!("failed to open Excel: {}", e))?;
let sheet_names = workbook.sheet_names().to_vec();
// Determine which sheet to read
let sheet_idx = match (sheet_name.clone(), sheet_index) {
(Some(name), _) => sheet_names
.iter()
.position(|n| n == &name)
.ok_or_else(|| format!("sheet '{}' not found. Available: {:?}", name, sheet_names))?,
(_, Some(idx)) => {
if idx >= sheet_names.len() {
return Err(format!(
"sheet index {} out of range (0..{})",
idx,
sheet_names.len()
));
}
idx
}
_ => 0,
};
let range = workbook
.worksheet_range_at(sheet_idx)
.map_err(|e| format!("failed to read sheet: {}", e))?;
let rows: Vec<Vec<serde_json::Value>> = range
.rows()
.skip(if has_header { offset + 1 } else { offset })
.take(limit)
.map(|row| {
row.iter()
.map(|cell| {
use calamine::Data;
match cell {
Data::Int(i) => serde_json::Value::Number((*i).into()),
Data::Float(f) => {
serde_json::json!(f)
}
Data::String(s) => serde_json::Value::String(s.clone()),
Data::Bool(b) => serde_json::Value::Bool(*b),
Data::DateTime(dt) => {
serde_json::Value::String(format!("{:?}", dt))
}
Data::DateTimeIso(s) => serde_json::Value::String(s.clone()),
Data::DurationIso(s) => serde_json::Value::String(s.clone()),
Data::Error(e) => serde_json::json!({ "error": format!("{:?}", e) }),
Data::Empty => serde_json::Value::Null,
}
})
.collect()
})
.collect();
let header_row: Vec<String> = if has_header {
range
.rows()
.next()
.map(|row| {
row.iter()
.map(|c| {
if let calamine::Data::String(s) = c {
s.clone()
} else {
String::new()
}
})
.collect()
})
.unwrap_or_default()
} else {
vec![]
};
Ok(serde_json::json!({
"path": path,
"rev": rev,
"sheets": sheet_names,
"active_sheet": sheet_names.get(sheet_idx).cloned(),
"sheet_index": sheet_idx,
"headers": header_row,
"rows": rows,
"row_count": rows.len(),
"total_rows": range.rows().count().saturating_sub(if has_header { 1 } else { 0 }),
}))
}
pub fn register_excel_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path within the repository (supports .xlsx, .xls)".into()), required: true, properties: None, items: None }),
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
("sheet_name".into(), ToolParam { name: "sheet_name".into(), param_type: "string".into(), description: Some("Sheet name to read. Defaults to first sheet.".into()), required: false, properties: None, items: None }),
("sheet_index".into(), ToolParam { name: "sheet_index".into(), param_type: "integer".into(), description: Some("Sheet index (0-based). Ignored if sheet_name is set.".into()), required: false, properties: None, items: None }),
("has_header".into(), ToolParam { name: "has_header".into(), param_type: "boolean".into(), description: Some("If true, first row is column headers (default: true)".into()), required: false, properties: None, items: None }),
("offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of rows to skip (default: 0)".into()), required: false, properties: None, items: None }),
("limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum rows to return (default: 100)".into()), required: false, properties: None, items: None }),
]);
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
registry.register(
ToolDefinition::new("read_excel")
.description("Parse and query Excel spreadsheets (.xlsx, .xls). Returns sheet names, headers, and rows with support for sheet selection and pagination.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_excel_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}

View File

@ -0,0 +1,341 @@
//! git_grep — search repository files for patterns.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use regex::RegexBuilder;
use std::collections::HashMap;
/// Text file extensions to search (skip binary files).
/// (The duplicate "vue" entry from the first draft has been removed.)
const TEXT_EXTS: &[&str] = &[
    "rs", "toml", "yaml", "yml", "json", "jsonc", "js", "jsx", "ts", "tsx",
    "css", "scss", "less", "html", "htm", "xml", "svg", "vue", "svelte",
    "py", "rb", "go", "java", "kt", "swift", "c", "cpp", "h", "hpp",
    "cs", "php", "pl", "sh", "bash", "zsh", "fish", "ps1", "bat", "cmd",
    "sql", "md", "markdown", "rst", "txt", "log", "ini", "cfg", "conf",
    "dockerfile", "makefile", "cmake", "gradle", "properties", "env",
    "proto", "graphql", "lock",
];

/// True when `path`'s extension (text after the last '.') is listed in
/// [`TEXT_EXTS`], compared case-insensitively. Files without a '.' in
/// their path (e.g. a bare `Makefile`) never match.
///
/// Compares the extracted extension directly instead of building a
/// `format!(".{}")` string per candidate extension per call.
fn is_text_ext(path: &str) -> bool {
    let lower = path.to_lowercase();
    match lower.rsplit_once('.') {
        Some((_, ext)) => TEXT_EXTS.contains(&ext),
        None => false,
    }
}

/// Heuristic binary sniff: any NUL byte within the first 8 KiB marks the
/// content as binary.
fn is_binary_content(data: &[u8]) -> bool {
    data.iter().take(8192).any(|&b| b == 0)
}
/// Execute the `git_grep` tool: walk the tree of the requested revision
/// and scan candidate text blobs line by line with a regex, returning
/// matches plus optional context lines.
///
/// Behavior visible in the code below:
/// * matching is always case-insensitive, in both regex and literal mode;
/// * without a `glob` argument only files whose extension passes
///   `is_text_ext` are scanned; when a `glob` IS given, the extension
///   filter is bypassed and only the glob decides;
/// * blobs that are empty, exceed `MAX_FILE_SIZE`, contain a NUL in the
///   first 8 KiB, or are not valid UTF-8 are silently skipped;
/// * the function returns early with `truncated: true` once `max_results`
///   matches have been collected.
async fn git_grep_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let pattern = p
        .get("pattern")
        .and_then(|v| v.as_str())
        .ok_or("missing pattern")?;
    let glob = p.get("glob").and_then(|v| v.as_str()).map(String::from);
    let is_regex = p
        .get("is_regex")
        .and_then(|v| v.as_bool())
        .unwrap_or(true);
    let context_lines = p
        .get("context_lines")
        .and_then(|v| v.as_u64())
        .unwrap_or(0) as usize;
    let max_results = p
        .get("max_results")
        .and_then(|v| v.as_u64())
        .unwrap_or(100) as usize;
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // Resolve revision to commit oid: a 40+ char rev is taken verbatim as
    // a full oid, anything shorter is resolved as a prefix/ref.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    // Build the matcher once, up front. Both branches force
    // case-insensitive matching.
    let regex = if is_regex {
        RegexBuilder::new(pattern)
            .case_insensitive(true)
            .build()
            .map_err(|e| format!("invalid regex '{}': {}", pattern, e))?
    } else {
        // Escape for literal search
        RegexBuilder::new(&regex::escape(pattern))
            .case_insensitive(true)
            .build()
            .map_err(|e| e.to_string())?
    };
    // Recursive tree walk using git2 (iterative, via an explicit stack).
    let repo = domain.repo();
    let commit = repo
        .find_commit(commit_oid.to_oid().map_err(|e| e.to_string())?)
        .map_err(|e| e.to_string())?;
    let tree = commit.tree().map_err(|e| e.to_string())?;
    let mut results: Vec<serde_json::Value> = Vec::new();
    // Stack: (tree, current_path_prefix)
    let mut stack: Vec<(git2::Tree<'_>, String)> = vec![(tree, String::new())];
    while let Some((current_tree, current_prefix)) = stack.pop() {
        for entry in current_tree.iter() {
            let name = entry.name().unwrap_or_default();
            if name.is_empty() {
                continue;
            }
            // Full repo-relative path of this entry.
            let path: String = if current_prefix.is_empty() {
                name.to_string()
            } else {
                format!("{}/{}", current_prefix, name)
            };
            // Subdirectories are pushed for a later iteration.
            if entry.kind() == Some(git2::ObjectType::Tree) {
                if let Some(subtree) = entry.to_object(&repo).ok().and_then(|o| o.into_tree().ok()) {
                    stack.push((subtree, path));
                }
                continue;
            }
            // Skip anything that is neither tree nor blob (e.g. submodule
            // commits).
            if entry.kind() != Some(git2::ObjectType::Blob) {
                continue;
            }
            // Glob filter; note the glob REPLACES the extension check
            // rather than combining with it.
            if let Some(ref g) = glob {
                if !glob_match(&path, g) {
                    continue;
                }
            } else if !is_text_ext(&path) {
                continue;
            }
            // Read blob content; unreadable objects are skipped silently.
            let blob = match entry.to_object(&repo).ok().and_then(|o| o.into_blob().ok()) {
                Some(b) => b,
                None => continue,
            };
            let size = blob.size();
            if size == 0 || size > MAX_FILE_SIZE {
                continue;
            }
            let data = blob.content();
            if is_binary_content(data) {
                continue;
            }
            // Strict UTF-8 here (unlike read_csv's lossy decode): invalid
            // UTF-8 blobs are skipped, not transcoded.
            let content = match String::from_utf8(data.to_vec()) {
                Ok(s) => s,
                Err(_) => continue,
            };
            // Search line by line
            let lines: Vec<&str> = content.lines().collect();
            for (line_idx, line) in lines.iter().enumerate() {
                if regex.is_match(line) {
                    // Context window clamped to the file's bounds.
                    let start = line_idx.saturating_sub(context_lines);
                    let end = (line_idx + context_lines + 1).min(lines.len());
                    // Render context with 1-based line numbers; the
                    // matching line is prefixed with '>'.
                    let context: Vec<String> = lines[start..end]
                        .iter()
                        .enumerate()
                        .map(|(i, l)| {
                            let line_num = start + i + 1;
                            let prefix = if start + i == line_idx { ">" } else { " " };
                            format!("{}{}: {}", prefix, line_num, l)
                        })
                        .collect();
                    results.push(serde_json::json!({
                        "file": path,
                        "line_number": line_idx + 1,
                        "match": line,
                        "context": context.join("\n"),
                    }));
                    // Early return once the cap is hit; total_matches is
                    // the capped count, not the repo-wide total.
                    if results.len() >= max_results {
                        return Ok(serde_json::json!({
                            "query": pattern,
                            "rev": rev,
                            "total_matches": results.len(),
                            "truncated": true,
                            "results": results
                        }));
                    }
                }
            }
        }
    }
    Ok(serde_json::json!({
        "query": pattern,
        "rev": rev,
        "total_matches": results.len(),
        "truncated": false,
        "results": results
    }))
}
/// Match `path` against a simplified glob `pattern`, case-insensitively.
///
/// Supported syntax: `*` (any run of characters within one path segment),
/// `?` (exactly one character), and `**` (any number of whole path
/// segments, including zero). A pattern without `/` is matched against
/// the file name only (so `*.rs` matches `src/main.rs`); a pattern
/// containing `/` must match the full path, segment by segment.
///
/// Fixes over the previous version: the path is lowercased as well as
/// the pattern (so `*.md` now matches `README.MD`), `?` is actually
/// implemented, and leftover unmatched path segments now fail the match
/// (`src/*.rs` no longer matches `src/x.rs/y.txt`).
fn glob_match(path: &str, pattern: &str) -> bool {
    // Wildcard match of a single segment: `*` = any run, `?` = one char.
    // Iterative with backtracking to the most recent `*`.
    fn seg(text: &str, pat: &str) -> bool {
        let t: Vec<char> = text.chars().collect();
        let p: Vec<char> = pat.chars().collect();
        let (mut ti, mut pi) = (0usize, 0usize);
        let mut star: Option<(usize, usize)> = None;
        while ti < t.len() {
            if pi < p.len() && (p[pi] == '?' || p[pi] == t[ti]) {
                ti += 1;
                pi += 1;
            } else if pi < p.len() && p[pi] == '*' {
                star = Some((pi, ti));
                pi += 1;
            } else if let Some((sp, st)) = star {
                // Let the last `*` absorb one more character and retry.
                pi = sp + 1;
                ti = st + 1;
                star = Some((sp, st + 1));
            } else {
                return false;
            }
        }
        // Trailing `*`s match the empty string.
        while pi < p.len() && p[pi] == '*' {
            pi += 1;
        }
        pi == p.len()
    }

    // Match path segments against pattern segments; `**` may consume any
    // number of whole segments (including none). Both slices must be
    // fully consumed for a match.
    fn walk(paths: &[&str], pats: &[&str]) -> bool {
        match pats.first() {
            None => paths.is_empty(),
            Some(&"**") => (0..=paths.len()).any(|k| walk(&paths[k..], &pats[1..])),
            Some(pat) => match paths.first() {
                Some(part) if seg(part, pat) => walk(&paths[1..], &pats[1..]),
                _ => false,
            },
        }
    }

    let path_lower = path.to_lowercase();
    let pattern_lower = pattern.to_lowercase();
    if !pattern_lower.contains('/') {
        // Filename-only pattern: compare against the last path segment.
        let file_name = path_lower.rsplit('/').next().unwrap_or(&path_lower);
        return seg(file_name, &pattern_lower);
    }
    let path_parts: Vec<&str> = path_lower.split('/').collect();
    let pat_parts: Vec<&str> = pattern_lower.split('/').collect();
    walk(&path_parts, &pat_parts)
}
pub fn register_grep_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam {
name: "project_name".into(),
param_type: "string".into(),
description: Some("Project name (slug)".into()),
required: true,
properties: None,
items: None,
}),
("repo_name".into(), ToolParam {
name: "repo_name".into(),
param_type: "string".into(),
description: Some("Repository name".into()),
required: true,
properties: None,
items: None,
}),
("pattern".into(), ToolParam {
name: "pattern".into(),
param_type: "string".into(),
description: Some("Search pattern (regex or literal string)".into()),
required: true,
properties: None,
items: None,
}),
("rev".into(), ToolParam {
name: "rev".into(),
param_type: "string".into(),
description: Some("Git revision to search in (branch, tag, commit). Default: HEAD".into()),
required: false,
properties: None,
items: None,
}),
("glob".into(), ToolParam {
name: "glob".into(),
param_type: "string".into(),
description: Some("File glob pattern to filter (e.g. *.rs, src/**/*.ts)".into()),
required: false,
properties: None,
items: None,
}),
("is_regex".into(), ToolParam {
name: "is_regex".into(),
param_type: "boolean".into(),
description: Some("If true, pattern is a regex. If false, literal string. Default: true".into()),
required: false,
properties: None,
items: None,
}),
("context_lines".into(), ToolParam {
name: "context_lines".into(),
param_type: "integer".into(),
description: Some("Number of surrounding lines to include for each match. Default: 0".into()),
required: false,
properties: None,
items: None,
}),
("max_results".into(), ToolParam {
name: "max_results".into(),
param_type: "integer".into(),
description: Some("Maximum number of matches to return. Default: 100".into()),
required: false,
properties: None,
items: None,
}),
]);
let schema = ToolSchema {
schema_type: "object".into(),
properties: Some(p),
required: Some(vec!["project_name".into(), "repo_name".into(), "pattern".into()]),
};
registry.register(
ToolDefinition::new("git_grep")
.description("Search for a text pattern across all files in a repository at a given revision. Supports regex, glob filtering, and line-level context. Skips binary files automatically.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
git_grep_exec(gctx, args)
.await
.map_err(agent::ToolError::ExecutionError)
})
}),
);
}

View File

@ -0,0 +1,275 @@
//! read_json — parse, validate, and query JSON / JSONC files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use serde_json::Value as JsonValue;
use std::collections::HashMap;
/// Remove comments from JSONC (lines starting with // or /* */) for parsing.
/// Strips `//` line comments and `/* ... */` block comments from JSONC
/// text so the remainder can be fed to a strict JSON parser.
///
/// Comment markers inside string literals are preserved (including after
/// escaped quotes, so `"a\"// b"` survives intact). The newline that
/// terminates a line comment is kept so later parse-error line numbers stay
/// meaningful; all bytes inside a block comment, newlines included, are
/// dropped.
fn strip_jsonc_comments(input: &str) -> String {
    /// Lexer states for the single-pass scan.
    enum State {
        Code,
        Str,
        StrEscape,
        LineComment,
        BlockComment,
        /// Inside a block comment, immediately after a `*`.
        BlockCommentStar,
    }
    let mut out = String::with_capacity(input.len());
    let mut state = State::Code;
    let mut chars = input.chars().peekable();
    while let Some(c) = chars.next() {
        state = match state {
            State::Code => match c {
                '"' => {
                    out.push(c);
                    State::Str
                }
                '/' => match chars.peek() {
                    Some('/') => {
                        chars.next();
                        State::LineComment
                    }
                    Some('*') => {
                        chars.next();
                        State::BlockComment
                    }
                    // A lone slash is ordinary content.
                    _ => {
                        out.push(c);
                        State::Code
                    }
                },
                _ => {
                    out.push(c);
                    State::Code
                }
            },
            State::Str => {
                out.push(c);
                match c {
                    '\\' => State::StrEscape,
                    '"' => State::Code,
                    _ => State::Str,
                }
            }
            State::StrEscape => {
                // The escaped character is emitted verbatim, so an escaped
                // quote does not terminate the string.
                out.push(c);
                State::Str
            }
            State::LineComment => {
                if c == '\n' {
                    out.push(c);
                    State::Code
                } else {
                    State::LineComment
                }
            }
            State::BlockComment => {
                if c == '*' {
                    State::BlockCommentStar
                } else {
                    State::BlockComment
                }
            }
            State::BlockCommentStar => match c {
                '/' => State::Code,
                '*' => State::BlockCommentStar,
                _ => State::BlockComment,
            },
        };
    }
    out
}
/// Produces a compact structural summary ("schema") of a JSON value.
///
/// Recursion stops after `max_depth` levels, emitting `{"type": "MAX_DEPTH"}`
/// as a sentinel. Array element types are inferred from the first element
/// only, so heterogeneous arrays are summarized approximately.
fn infer_schema(value: &JsonValue, max_depth: usize) -> JsonValue {
    if max_depth == 0 {
        return serde_json::json!({ "type": "MAX_DEPTH" });
    }
    let type_only = |name: &str| serde_json::json!({ "type": name });
    match value {
        JsonValue::Null => type_only("null"),
        JsonValue::Bool(_) => type_only("boolean"),
        JsonValue::Number(_) => type_only("number"),
        JsonValue::String(_) => type_only("string"),
        JsonValue::Array(items) => match items.first() {
            None => serde_json::json!({ "type": "array", "items": null }),
            Some(first) => serde_json::json!({
                "type": "array",
                "length": items.len(),
                "items": infer_schema(first, max_depth - 1),
            }),
        },
        JsonValue::Object(map) => {
            // Summarize every property one level deeper.
            let properties: serde_json::Map<String, JsonValue> = map
                .iter()
                .map(|(key, val)| (key.clone(), infer_schema(val, max_depth - 1)))
                .collect();
            let mut schema = serde_json::Map::new();
            schema.insert("type".into(), JsonValue::String("object".into()));
            schema.insert("properties".into(), JsonValue::Object(properties));
            schema.insert("keyCount".into(), serde_json::json!(map.len()));
            JsonValue::Object(schema)
        }
    }
}
/// Reads a JSON/JSONC blob from the repository at `rev`, optionally applies
/// a JSONPath-like `query`, and returns the (possibly truncated) value
/// together with an inferred schema.
///
/// Arguments (from `args`):
/// * `project_name`, `repo_name`, `path` — required blob location.
/// * `rev` — revision; defaults to `HEAD`.
/// * `query` — optional JSONPath-like selector (see `query_json`).
/// * `schema_depth` — schema inference depth; defaults to 4.
/// * `pretty` — pretty-print the returned data; defaults to false.
///
/// Errors with a human-readable message on missing arguments, oversized
/// files, unreadable blobs, or JSON parse failures.
async fn read_json_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let query = p.get("query").and_then(|v| v.as_str()).map(String::from);
    let max_depth = p.get("schema_depth").and_then(|v| v.as_u64()).unwrap_or(4) as usize;
    let pretty = p.get("pretty").and_then(|v| v.as_bool()).unwrap_or(false);
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // A 40+ character rev is taken as a full commit oid; anything shorter is
    // resolved as a prefix/refname by the domain layer.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }
    let text = String::from_utf8_lossy(data);
    // Heuristic JSONC detection. NOTE(review): `text.contains("//")` also
    // fires on URLs inside strings; harmless, because the comment stripper
    // is string-aware and leaves string contents untouched.
    let is_jsonc = path.ends_with(".jsonc") || path.ends_with(".vscodeignore") || text.contains("//");
    let json_text = if is_jsonc {
        strip_jsonc_comments(&text)
    } else {
        text.to_string()
    };
    let parsed: JsonValue = serde_json::from_str(&json_text)
        .map_err(|e| format!("JSON parse error at {}: {}", e.line(), e))?;
    // Apply the JSONPath-like query, if any.
    let result = if let Some(ref q) = query {
        query_json(&parsed, q)?
    } else {
        parsed
    };
    let schema = infer_schema(&result, max_depth);
    let display = if pretty {
        serde_json::to_string_pretty(&result).unwrap_or_default()
    } else {
        serde_json::to_string(&result).unwrap_or_default()
    };
    // Truncate on a character boundary. The previous byte slice
    // `&display[..5000]` panicked whenever byte 5000 fell inside a
    // multi-byte UTF-8 character (and mixed a char-count check with a
    // byte-index slice).
    let total_chars = display.chars().count();
    let data_out = if total_chars > 5000 {
        let head: String = display.chars().take(5000).collect();
        format!("{}... (truncated, {} chars total)", head, total_chars)
    } else {
        display
    };
    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "format": if is_jsonc { "jsonc" } else { "json" },
        "size_bytes": data.len(),
        "schema": schema,
        "data": data_out,
    }))
}
/// Simple JSONPath-like query support.
/// Supports: $.key, $[0], $.key.nested, $.arr[0].field
fn query_json(value: &JsonValue, query: &str) -> Result<JsonValue, String> {
    let trimmed = query.trim();
    // Drop the optional `$` / `$.` root marker.
    let path = if let Some(rest) = trimmed.strip_prefix("$.") {
        rest
    } else if trimmed.len() > 1 {
        trimmed.strip_prefix('$').unwrap_or(trimmed)
    } else {
        trimmed
    };
    let mut current = value.clone();
    for segment in path.split('.') {
        if segment.is_empty() {
            continue;
        }
        match segment.find('[') {
            None => {
                // Plain property access; missing keys resolve to null.
                if let JsonValue::Object(map) = &current {
                    current = map.get(segment).cloned().unwrap_or(JsonValue::Null);
                } else {
                    return Err(format!("property '{}' not found", segment));
                }
            }
            Some(bracket) => {
                // Optional key before the first `[`, e.g. `arr` in `arr[0]`.
                let key = &segment[..bracket];
                if !key.is_empty() {
                    if let JsonValue::Object(map) = &current {
                        current = map.get(key).cloned().unwrap_or(JsonValue::Null);
                    } else {
                        return Err(format!("cannot access property '{}' on non-object", key));
                    }
                }
                // One or more `[N]` index accesses; non-numeric indices are
                // ignored, out-of-range indices resolve to null.
                for chunk in segment[bracket..].split_inclusive(']') {
                    if chunk.is_empty() || chunk == "]" {
                        continue;
                    }
                    if let Some(num) = chunk.trim_end_matches(']').strip_prefix('[') {
                        if let Ok(index) = num.parse::<usize>() {
                            if let JsonValue::Array(items) = &current {
                                current = items.get(index).cloned().unwrap_or(JsonValue::Null);
                            } else {
                                return Err(format!("index {} on non-array", index));
                            }
                        }
                    }
                }
            }
        }
    }
    Ok(current)
}
pub fn register_json_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the JSON or JSONC file".into()), required: true, properties: None, items: None }),
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
("query".into(), ToolParam { name: "query".into(), param_type: "string".into(), description: Some("JSONPath-like query (e.g. $.config.items[0].name) to extract a subset of the document".into()), required: false, properties: None, items: None }),
("schema_depth".into(), ToolParam { name: "schema_depth".into(), param_type: "integer".into(), description: Some("How deep to infer the JSON schema (default: 4)".into()), required: false, properties: None, items: None }),
("pretty".into(), ToolParam { name: "pretty".into(), param_type: "boolean".into(), description: Some("Pretty-print the output (default: false)".into()), required: false, properties: None, items: None }),
]);
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
registry.register(
ToolDefinition::new("read_json")
.description("Parse, validate, and query JSON and JSONC files. Supports JSONPath-like queries ($.key, $.arr[0]), schema inference, and pretty-printing. Automatically detects JSONC (with // comments).")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_json_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}

View File

@ -0,0 +1,286 @@
//! read_markdown — parse and analyze Markdown files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Parser, Tag, TagEnd};
use std::collections::HashMap;
async fn read_markdown_exec(
ctx: GitToolCtx,
args: serde_json::Value,
) -> Result<serde_json::Value, String> {
let p: serde_json::Map<String, serde_json::Value> =
serde_json::from_value(args).map_err(|e| e.to_string())?;
let project_name = p
.get("project_name")
.and_then(|v| v.as_str())
.ok_or("missing project_name")?;
let repo_name = p
.get("repo_name")
.and_then(|v| v.as_str())
.ok_or("missing repo_name")?;
let path = p
.get("path")
.and_then(|v| v.as_str())
.ok_or("missing path")?;
let rev = p
.get("rev")
.and_then(|v| v.as_str())
.map(String::from)
.unwrap_or_else(|| "HEAD".to_string());
let include_code = p
.get("include_code")
.and_then(|v| v.as_bool())
.unwrap_or(true);
let sections_only = p
.get("sections_only")
.and_then(|v| v.as_bool())
.unwrap_or(false);
let domain = ctx.open_repo(project_name, repo_name).await?;
let commit_oid = if rev.len() >= 40 {
git::commit::types::CommitOid::new(&rev)
} else {
domain
.commit_get_prefix(&rev)
.map_err(|e| e.to_string())?
.oid
};
let entry = domain
.tree_entry_by_path_from_commit(&commit_oid, path)
.map_err(|e| e.to_string())?;
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
let data = &content.content;
if data.len() > MAX_FILE_SIZE {
return Err(format!(
"file too large ({} bytes), max {} bytes",
data.len(),
MAX_FILE_SIZE
));
}
let text = String::from_utf8_lossy(data);
let parser = Parser::new(&text);
let mut sections: Vec<serde_json::Value> = Vec::new();
let mut code_blocks: Vec<serde_json::Value> = Vec::new();
let mut links: Vec<serde_json::Value> = Vec::new();
let mut images: Vec<serde_json::Value> = Vec::new();
let mut current_heading_level: Option<u32> = None;
let mut current_heading_text = String::new();
let mut in_code_block = false;
let mut code_block_lang = String::new();
let mut code_block_content = String::new();
let mut toc: Vec<serde_json::Value> = Vec::new();
for event in parser {
match event {
Event::Start(Tag::Heading { level, .. }) => {
current_heading_level = Some(match level {
HeadingLevel::H1 => 1,
HeadingLevel::H2 => 2,
HeadingLevel::H3 => 3,
HeadingLevel::H4 => 4,
HeadingLevel::H5 => 5,
HeadingLevel::H6 => 6,
});
current_heading_text.clear();
}
Event::End(TagEnd::Heading(level)) => {
let lvl = match level {
HeadingLevel::H1 => 1,
HeadingLevel::H2 => 2,
HeadingLevel::H3 => 3,
HeadingLevel::H4 => 4,
HeadingLevel::H5 => 5,
HeadingLevel::H6 => 6,
};
let heading = current_heading_text.trim().to_string();
if !heading.is_empty() {
let section = serde_json::json!({
"level": lvl,
"title": heading,
});
toc.push(section.clone());
if !sections_only {
sections.push(serde_json::json!({
"level": lvl,
"title": heading,
"content": "",
}));
}
}
current_heading_level = None;
}
Event::Text(text) => {
if in_code_block {
code_block_content.push_str(&text);
code_block_content.push('\n');
} else if let Some(_) = current_heading_level {
current_heading_text.push_str(&text);
current_heading_text.push(' ');
}
}
Event::Code(code) => {
code_blocks.push(serde_json::json!({
"language": "",
"code": code.as_ref(),
}));
}
Event::Start(Tag::CodeBlock(kind)) => {
in_code_block = true;
code_block_content.clear();
code_block_lang = match kind {
CodeBlockKind::Fenced(info) => info.as_ref().to_string(),
CodeBlockKind::Indented => String::new(),
};
}
Event::End(TagEnd::CodeBlock) => {
in_code_block = false;
if include_code {
code_blocks.push(serde_json::json!({
"language": code_block_lang,
"code": code_block_content.trim().to_string(),
}));
}
code_block_lang.clear();
}
Event::Start(Tag::Link { dest_url, .. }) => {
links.push(serde_json::json!({ "url": dest_url.to_string() }));
}
Event::Start(Tag::Image { dest_url, .. }) => {
images.push(serde_json::json!({ "url": dest_url.to_string() }));
}
_ => {}
}
}
// Build outline (h1/h2/h3 only)
let outline: Vec<serde_json::Value> = toc
.iter()
.filter(|s| {
let lvl = s.get("level").and_then(|v| v.as_u64()).unwrap_or(0) as u32;
lvl <= 3
})
.cloned()
.collect();
Ok(serde_json::json!({
"path": path,
"rev": rev,
"stats": {
"chars": text.chars().count(),
"words": text.split_whitespace().count(),
"lines": text.lines().count(),
"headings": toc.len(),
"code_blocks": code_blocks.len(),
"links": links.len(),
"images": images.len(),
},
"outline": outline,
"headings": toc,
"code_blocks": if include_code { code_blocks } else { vec![] },
"links": links,
"images": images,
}))
}
pub fn register_markdown_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
(
"project_name".into(),
ToolParam {
name: "project_name".into(),
param_type: "string".into(),
description: Some("Project name (slug)".into()),
required: true,
properties: None,
items: None,
},
),
(
"repo_name".into(),
ToolParam {
name: "repo_name".into(),
param_type: "string".into(),
description: Some("Repository name".into()),
required: true,
properties: None,
items: None,
},
),
(
"path".into(),
ToolParam {
name: "path".into(),
param_type: "string".into(),
description: Some("File path to the Markdown file".into()),
required: true,
properties: None,
items: None,
},
),
(
"rev".into(),
ToolParam {
name: "rev".into(),
param_type: "string".into(),
description: Some("Git revision (default: HEAD)".into()),
required: false,
properties: None,
items: None,
},
),
(
"sections_only".into(),
ToolParam {
name: "sections_only".into(),
param_type: "boolean".into(),
description: Some(
"If true, return only section headings (outline). Default: false".into(),
),
required: false,
properties: None,
items: None,
},
),
(
"include_code".into(),
ToolParam {
name: "include_code".into(),
param_type: "boolean".into(),
description: Some("Include code blocks in result. Default: true".into()),
required: false,
properties: None,
items: None,
},
),
]);
let schema = ToolSchema {
schema_type: "object".into(),
properties: Some(p),
required: Some(vec![
"project_name".into(),
"repo_name".into(),
"path".into(),
]),
};
registry.register(
ToolDefinition::new("read_markdown")
.description("Parse and analyze a Markdown file. Returns document statistics, heading outline, code blocks with languages, links, and images.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_markdown_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}

View File

@ -0,0 +1,39 @@
//! File reading and search tools for AI agents.
//!
//! Tools for reading structured files (CSV, Excel, Word, PDF, PPT, Markdown,
//! SQL, JSON) and searching across repository files (git_grep).
//!
//! All tools operate on repository blobs (read via git context) or standalone
//! content, returning structured JSON suitable for AI consumption.
pub mod csv;
// TODO: fix calamine 0.26 API compatibility (open_workbook path requirement)
// pub mod excel;
pub mod grep;
pub mod json;
pub mod markdown;
// TODO: fix lopdf 0.34 API (no load_from_mem, different stream API)
// pub mod pdf;
// TODO: fix ppt archive borrow checker issue
// pub mod ppt;
pub mod sql;
// TODO: fix quick-xml 0.37 + zip Cursor API
// pub mod word;
use agent::ToolRegistry;
/// Maximum number of bytes to read from any single file (prevents huge blobs).
const MAX_FILE_SIZE: usize = 2 * 1024 * 1024; // 2MB
/// Registers all file tools into a ToolRegistry.
///
/// Active modules: grep, csv, markdown, sql, json. The commented-out calls
/// correspond to the deferred modules whose `mod` declarations are also
/// disabled above; re-enable both together once the library
/// incompatibilities noted in the TODOs are resolved.
///
/// NOTE(review): whether registration order affects how tools are listed
/// to the model depends on `ToolRegistry` internals — confirm before
/// reordering these calls.
pub fn register_all(registry: &mut ToolRegistry) {
    grep::register_grep_tools(registry);
    csv::register_csv_tools(registry);
    // excel::register_excel_tools(registry);
    // word::register_word_tools(registry);
    // pdf::register_pdf_tools(registry);
    // ppt::register_ppt_tools(registry);
    markdown::register_markdown_tools(registry);
    sql::register_sql_tools(registry);
    json::register_json_tools(registry);
}

View File

@ -0,0 +1,244 @@
//! read_pdf — extract text from PDF files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use futures::FutureExt;
use lopdf::{Document, Object, ObjectId};
use std::collections::HashMap;
/// Extract text content from a PDF page's content stream.
///
/// Reads the page dictionary's `Contents` entry — either a single stream
/// reference or an array of references — decompresses each stream, and
/// runs the naive literal-string extractor over the decoded bytes. All
/// lookup failures degrade to returning whatever text was gathered so far
/// (possibly empty); this function never errors.
///
/// NOTE(review): this module is currently excluded from the build (see the
/// file_tools TODOs about lopdf 0.34). `doc.get(...)` and
/// `doc.get_stream(...)` as called here may not match the lopdf 0.34 API
/// (`get_dictionary` / `get_object` are the documented lookups) — confirm
/// when re-enabling the module.
fn extract_page_text(doc: &Document, page_id: ObjectId) -> String {
    let mut text = String::new();
    // Get page dictionary
    let page_dict = match doc.get(page_id) {
        Ok(dict) => dict,
        Err(_) => return text,
    };
    // Get content streams (can be a single stream or array)
    let content_streams = match page_dict.get(b"Contents") {
        Ok(obj) => obj.clone(),
        Err(_) => return text,
    };
    // Normalize to a flat list of stream object ids; non-reference array
    // entries are silently skipped.
    let stream_ids: Vec<ObjectId> = match &content_streams {
        Object::Reference(id) => vec![*id],
        Object::Array(arr) => arr
            .iter()
            .filter_map(|o| {
                if let Object::Reference(id) = o {
                    Some(*id)
                } else {
                    None
                }
            })
            .collect(),
        _ => return text,
    };
    for stream_id in stream_ids {
        if let Ok((_, stream)) = doc.get_stream(stream_id) {
            // Decode the stream
            // NOTE(review): the extra decompress_pdf_stream pass on top of
            // decompressed_content() looks redundant — confirm whether the
            // library output can still be deflate-wrapped.
            if let Ok(decompressed) = stream.decompressed_content() {
                text.push_str(&extract_text_from_content(&decompress_pdf_stream(&decompressed)));
                text.push('\n');
            }
        }
    }
    text
}
/// Very simple PDF content stream text extraction.
///
/// Collects the contents of `(...)` string literals (as used by the Tj/TJ
/// text-showing operators), handling `\n`, `\r`, `\t` and generic
/// single-character escapes, and skips `%` comments outside strings.
/// Operators themselves are not interpreted, so positioning is lost;
/// octal escapes and hex `<...>` strings are not handled.
///
/// Returns the gathered strings joined as trimmed, non-empty lines.
fn extract_text_from_content(content: &[u8]) -> String {
    let data = String::from_utf8_lossy(content);
    let mut result = String::new();
    let mut in_parens = false;
    let mut current_text = String::new();
    // (The previous revision carried a `last_was_tj` flag that was never
    // set and guarded an empty branch; it has been removed.)
    let mut chars = data.chars();
    while let Some(c) = chars.next() {
        match c {
            '(' => {
                in_parens = true;
                current_text.clear();
            }
            ')' if in_parens => {
                in_parens = false;
                if !current_text.is_empty() {
                    result.push_str(&current_text);
                    result.push(' ');
                }
            }
            c if in_parens => {
                if c == '\\' {
                    // Escape sequence inside a string literal; unknown
                    // escapes fall back to the escaped character itself.
                    if let Some(escaped) = chars.next() {
                        match escaped {
                            'n' => current_text.push('\n'),
                            'r' => current_text.push('\r'),
                            't' => current_text.push('\t'),
                            _ => current_text.push(escaped),
                        }
                    }
                } else {
                    current_text.push(c);
                }
            }
            '%' => {
                // Comment outside a string: skip to end of line.
                for nc in chars.by_ref() {
                    if nc == '\n' || nc == '\r' {
                        break;
                    }
                }
            }
            _ => {}
        }
    }
    // Clean up: keep trimmed, non-empty lines only.
    let lines: Vec<&str> = result
        .lines()
        .map(str::trim)
        .filter(|l| !l.is_empty())
        .collect();
    lines.join("\n")
}
/// Best-effort decompression of a PDF stream payload.
///
/// Tries zlib-wrapped deflate first (based on a leading-byte sniff), then
/// raw deflate, and finally falls back to returning the input unchanged.
///
/// NOTE(review): `Read::bytes` needs `std::io::Read` in scope; this file's
/// header imports only `futures::FutureExt`, so this likely does not
/// compile as-is — the module is currently excluded from the build.
/// NOTE(review): the `[0x08, 0x1b]` sniff does not appear to be a valid
/// zlib CMF/FLG pair (the %31 checksum fails) — confirm the intended magic.
fn decompress_pdf_stream(data: &[u8]) -> Vec<u8> {
    // Try to detect and decompress flate/zlib streams
    if data.len() < 2 {
        return data.to_vec();
    }
    // Simple zlib check: zlib-wrapped deflate starts with 0x78
    if data.starts_with(&[0x78]) || data.starts_with(&[0x08, 0x1b]) {
        if let Ok(decoded) = flate2::read::ZlibDecoder::new(data).bytes().collect::<Result<Vec<_>, _>>() {
            return decoded;
        }
    }
    // Try raw deflate
    if let Ok(decoded) = flate2::read::DeflateDecoder::new(data).bytes().collect::<Result<Vec<_>, _>>() {
        return decoded;
    }
    // Fall back to the raw bytes when nothing decodes.
    data.to_vec()
}
/// Extracts text from a PDF blob in the repository, page by page.
///
/// Arguments (from `args`):
/// * `project_name`, `repo_name`, `path` — required blob location.
/// * `rev` — revision; defaults to `HEAD`.
/// * `page_start` / `page_end` — 1-based inclusive page range.
/// * `max_pages` — cap on extracted pages when `page_end` is absent
///   (default 20).
async fn read_pdf_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let page_start = p.get("page_start").and_then(|v| v.as_u64()).map(|v| v as usize);
    let page_end = p.get("page_end").and_then(|v| v.as_u64()).map(|v| v as usize);
    let max_pages = p
        .get("max_pages")
        .and_then(|v| v.as_u64())
        .unwrap_or(20) as usize;
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // A 40+ character rev is taken as a full commit oid; shorter revs are
    // resolved as prefixes/refnames by the domain layer.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }
    // lopdf's in-memory loader is `load_mem` (the previous `load_from_mem`
    // does not exist in lopdf 0.34).
    let doc = Document::load_mem(data)
        .map_err(|e| format!("failed to parse PDF: {}", e))?;
    // Page object ids in page-number order (`get_pages` is keyed by page
    // number; the previous `doc.pages` field access is not public API).
    let pages: Vec<ObjectId> = doc.get_pages().values().copied().collect();
    let total_pages = pages.len();
    // `page_start` is documented as 1-based; convert to a 0-based index.
    // (It was previously used directly as a skip count, starting one page
    // too late for any explicit value.)
    let start = page_start
        .map(|s| s.saturating_sub(1))
        .unwrap_or(0)
        .min(total_pages.saturating_sub(1));
    // A 1-based inclusive `page_end` equals the 0-based exclusive bound;
    // saturate so a range ending before `start` yields no pages instead of
    // panicking on underflow.
    let end = page_end.unwrap_or(start + max_pages).min(total_pages);
    let mut page_texts: Vec<serde_json::Value> = Vec::new();
    for (i, page_id) in pages.iter().enumerate().skip(start).take(end.saturating_sub(start)) {
        let text = extract_page_text(&doc, *page_id);
        page_texts.push(serde_json::json!({
            "page": i + 1,
            "text": text,
            "char_count": text.chars().count(),
        }));
    }
    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "total_pages": total_pages,
        "extracted_pages": page_texts.len(),
        "pages": page_texts,
    }))
}
pub fn register_pdf_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the PDF document".into()), required: true, properties: None, items: None }),
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
("page_start".into(), ToolParam { name: "page_start".into(), param_type: "integer".into(), description: Some("1-based starting page number (default: 1)".into()), required: false, properties: None, items: None }),
("page_end".into(), ToolParam { name: "page_end".into(), param_type: "integer".into(), description: Some("1-based ending page number (default: page_start + 20)".into()), required: false, properties: None, items: None }),
("max_pages".into(), ToolParam { name: "max_pages".into(), param_type: "integer".into(), description: Some("Maximum number of pages to extract (default: 20)".into()), required: false, properties: None, items: None }),
]);
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
registry.register(
ToolDefinition::new("read_pdf")
.description("Extract text content from PDF files. Returns page-by-page text extraction with character counts. Supports page range selection.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_pdf_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}

View File

@ -0,0 +1,204 @@
//! read_ppt — extract text from PowerPoint files (.pptx).
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use futures::FutureExt;
use std::collections::HashMap;
use zip::ZipArchive;
/// Extracts text from a PowerPoint (.pptx) blob, slide by slide.
///
/// Arguments (from `args`):
/// * `project_name`, `repo_name`, `path` — required blob location.
/// * `rev` — revision; defaults to `HEAD`.
/// * `slide_start` / `slide_end` — 1-based inclusive slide range.
/// * `include_notes` — also extract speaker notes (default false).
async fn read_ppt_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    use std::io::Read;
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let slide_start = p.get("slide_start").and_then(|v| v.as_u64()).map(|v| v as usize);
    let slide_end = p.get("slide_end").and_then(|v| v.as_u64()).map(|v| v as usize);
    let include_notes = p
        .get("include_notes")
        .and_then(|v| v.as_bool())
        .unwrap_or(false);
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // A 40+ character rev is taken as a full commit oid; shorter revs are
    // resolved as prefixes/refnames by the domain layer.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }
    let cursor = std::io::Cursor::new(data.clone());
    let mut archive =
        ZipArchive::new(cursor).map_err(|e| format!("failed to read PPTX ZIP: {}", e))?;
    // Probe slide entries by name; slides are numbered from 1 and the probe
    // is capped at 1000 to bound the scan.
    let slide_files: Vec<String> = (1..=1000)
        .filter_map(|i| {
            let name = format!("ppt/slides/slide{}.xml", i);
            if archive.by_name(&name).is_ok() {
                Some(name)
            } else {
                None
            }
        })
        .collect();
    let total_slides = slide_files.len();
    // `slide_start` is documented as 1-based; convert to a 0-based index.
    // (It was previously used directly as a skip count, starting one slide
    // too late for any explicit value.)
    let start = slide_start
        .map(|s| s.saturating_sub(1))
        .unwrap_or(0)
        .min(total_slides.saturating_sub(1));
    // A 1-based inclusive `slide_end` equals the 0-based exclusive bound;
    // saturate so a range ending before `start` yields no slides instead of
    // panicking on underflow.
    let end = slide_end.unwrap_or(start + 50).min(total_slides);
    let mut slides: Vec<serde_json::Value> = Vec::new();
    for slide_file in slide_files.iter().skip(start).take(end.saturating_sub(start)) {
        let slide_idx = slides.len() + start + 1;
        let mut file = archive
            .by_name(slide_file)
            .map_err(|e| format!("failed to read slide {}: {}", slide_file, e))?;
        let mut xml_content = String::new();
        file.read_to_string(&mut xml_content)
            .map_err(|e| e.to_string())?;
        // `ZipFile` borrows the archive (and has a Drop impl, which keeps
        // the borrow alive to end of scope) — release it explicitly before
        // the archive is reopened for the notes entry below.
        drop(file);
        // Extract text from slide XML
        let text = extract_text_from_pptx_xml(&xml_content);
        // Optionally extract matching speaker notes.
        let notes = if include_notes {
            let notes_name = format!("ppt/notesSlides/notesSlide{}.xml", slide_idx);
            match archive.by_name(&notes_name) {
                Ok(mut notes_file) => {
                    let mut notes_xml = String::new();
                    if notes_file.read_to_string(&mut notes_xml).is_ok() {
                        Some(extract_text_from_pptx_xml(&notes_xml))
                    } else {
                        None
                    }
                }
                Err(_) => None,
            }
        } else {
            None
        };
        // Compute the count before moving `text` (avoids the old clone).
        let char_count = text.chars().count();
        slides.push(serde_json::json!({
            "slide": slide_idx,
            "text": text,
            "char_count": char_count,
            "notes": notes,
        }));
    }
    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "total_slides": total_slides,
        "extracted_slides": slides.len(),
        "slides": slides,
    }))
}
/// Extract text content from PPTX slide XML using simple tag extraction.
///
/// Collects the bodies of `<a:t>` elements (DrawingML text runs) and
/// `<w:t>` elements (WordprocessingML runs, used by notes slides). `<w:t>`
/// text is de-duplicated against already-collected runs, matching the
/// previous behavior. This is a lightweight scanner, not an XML parser:
/// entities are not decoded and CDATA sections are not handled.
fn extract_text_from_pptx_xml(xml: &str) -> String {
    let mut runs: Vec<&str> = Vec::new();
    collect_tag_text(xml, "a:t", &mut runs, false);
    collect_tag_text(xml, "w:t", &mut runs, true);
    runs.join(" ")
}

/// Scans `xml` for `<tag ...>body</tag>` occurrences and pushes each
/// non-empty trimmed body into `out`. When `dedup` is set, bodies already
/// present in `out` are skipped. Self-closing tags contribute nothing, and
/// the tag name must end at a `>`, `/`, or whitespace boundary so that
/// `<a:t` does not match `<a:tab>` (which the previous revision did,
/// swallowing surrounding markup as text).
fn collect_tag_text<'a>(xml: &'a str, tag: &str, out: &mut Vec<&'a str>, dedup: bool) {
    let open = format!("<{}", tag);
    let close = format!("</{}>", tag);
    let mut pos = 0;
    while let Some(found) = xml[pos..].find(&open) {
        let tag_start = pos + found;
        let after_name = tag_start + open.len();
        // Require a name boundary so longer tag names are not matched.
        let boundary_ok = match xml[after_name..].chars().next() {
            Some(c) => c == '>' || c == '/' || c.is_whitespace(),
            None => false,
        };
        if !boundary_ok {
            pos = after_name;
            continue;
        }
        // Locate the end of the opening tag (attributes allowed).
        let gt = match xml[tag_start..].find('>') {
            Some(rel) => tag_start + rel,
            None => break,
        };
        // Self-closing tag (`<a:t/>`) has no body.
        if xml[..gt].ends_with('/') {
            pos = gt + 1;
            continue;
        }
        let body_start = gt + 1;
        match xml[body_start..].find(&close) {
            Some(rel) => {
                let body = xml[body_start..body_start + rel].trim();
                if !body.is_empty() && (!dedup || !out.contains(&body)) {
                    out.push(body);
                }
                pos = body_start + rel + close.len();
            }
            // Unterminated element: stop scanning.
            None => break,
        }
    }
}
pub fn register_ppt_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the .pptx document".into()), required: true, properties: None, items: None }),
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
("slide_start".into(), ToolParam { name: "slide_start".into(), param_type: "integer".into(), description: Some("1-based starting slide number (default: 1)".into()), required: false, properties: None, items: None }),
("slide_end".into(), ToolParam { name: "slide_end".into(), param_type: "integer".into(), description: Some("1-based ending slide number".into()), required: false, properties: None, items: None }),
("include_notes".into(), ToolParam { name: "include_notes".into(), param_type: "boolean".into(), description: Some("Include speaker notes (default: false)".into()), required: false, properties: None, items: None }),
]);
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
registry.register(
ToolDefinition::new("read_ppt")
.description("Extract text content from PowerPoint presentations (.pptx). Returns slide-by-slide text with character counts. Supports slide range selection and speaker notes.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_ppt_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}

View File

@ -0,0 +1,154 @@
//! read_sql — parse and analyze SQL files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use sqlparser::ast::{Statement, ColumnDef};
use sqlparser::dialect::{GenericDialect, MySqlDialect, PostgreSqlDialect, SQLiteDialect};
use sqlparser::parser::Parser;
use std::collections::HashMap;
async fn read_sql_exec(
ctx: GitToolCtx,
args: serde_json::Value,
) -> Result<serde_json::Value, String> {
let p: serde_json::Map<String, serde_json::Value> =
serde_json::from_value(args).map_err(|e| e.to_string())?;
let project_name = p
.get("project_name")
.and_then(|v| v.as_str())
.ok_or("missing project_name")?;
let repo_name = p
.get("repo_name")
.and_then(|v| v.as_str())
.ok_or("missing repo_name")?;
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
let rev = p
.get("rev")
.and_then(|v| v.as_str())
.map(String::from)
.unwrap_or_else(|| "HEAD".to_string());
let dialect = p.get("dialect").and_then(|v| v.as_str()).unwrap_or("generic");
let domain = ctx.open_repo(project_name, repo_name).await?;
let commit_oid = if rev.len() >= 40 {
git::commit::types::CommitOid::new(&rev)
} else {
domain
.commit_get_prefix(&rev)
.map_err(|e| e.to_string())?
.oid
};
let entry = domain
.tree_entry_by_path_from_commit(&commit_oid, path)
.map_err(|e| e.to_string())?;
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
let data = &content.content;
if data.len() > MAX_FILE_SIZE {
return Err(format!(
"file too large ({} bytes), max {} bytes",
data.len(),
MAX_FILE_SIZE
));
}
let text = String::from_utf8_lossy(data);
let parser_dialect: Box<dyn sqlparser::dialect::Dialect> = match dialect {
"mysql" => Box::new(MySqlDialect {}),
"postgresql" | "postgres" => Box::new(PostgreSqlDialect {}),
"sqlite" => Box::new(SQLiteDialect {}),
_ => Box::new(GenericDialect {}),
};
let statements = Parser::parse_sql(parser_dialect.as_ref(), &text)
.map_err(|e| format!("SQL parse error: {}", e))?;
let mut tables: Vec<serde_json::Value> = Vec::new();
let mut views: Vec<serde_json::Value> = Vec::new();
let mut functions: Vec<serde_json::Value> = Vec::new();
let mut indexes: Vec<serde_json::Value> = Vec::new();
let mut statement_kinds: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
for statement in &statements {
let kind = format!("{:?}", statement).split('{').next().unwrap_or("unknown").to_string();
*statement_kinds.entry(kind).or_insert(0) += 1;
match statement {
Statement::CreateTable(stmt) => {
let name = stmt.name.to_string();
let columns: Vec<String> = stmt.columns.iter().map(format_column_def).collect();
tables.push(serde_json::json!({
"name": name,
"columns": columns,
"if_not_exists": stmt.if_not_exists,
}));
}
Statement::CreateView { name, query, .. } => {
views.push(serde_json::json!({
"name": name.to_string(),
"query": query.to_string(),
}));
}
Statement::CreateIndex(stmt) => {
indexes.push(serde_json::json!({
"name": stmt.name.as_ref().map(|n| n.to_string()).unwrap_or_default(),
"table": stmt.table_name.to_string(),
"columns": stmt.columns.iter().map(|c| c.to_string()).collect::<Vec<_>>(),
}));
}
Statement::CreateFunction(stmt) => {
functions.push(serde_json::json!({
"name": stmt.name.to_string(),
"args": stmt.args.iter().flat_map(|args| args.iter().filter_map(|a| a.name.as_ref().map(|n| n.to_string()))).collect::<Vec<_>>(),
"return_type": stmt.return_type.as_ref().map(|r| r.to_string()).unwrap_or_default(),
}));
}
_ => {}
}
}
Ok(serde_json::json!({
"path": path,
"rev": rev,
"dialect": dialect,
"statement_count": statements.len(),
"statement_kinds": statement_kinds,
"tables": tables,
"views": views,
"functions": functions,
"indexes": indexes,
}))
}
/// Render a column definition as `"<name> <type>"`, e.g. `"id INT"`.
fn format_column_def(col: &ColumnDef) -> String {
    format!("{} {}", col.name, col.data_type)
}
pub fn register_sql_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the SQL file".into()), required: true, properties: None, items: None }),
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
("dialect".into(), ToolParam { name: "dialect".into(), param_type: "string".into(), description: Some("SQL dialect: generic, mysql, postgresql, sqlite. Default: generic".into()), required: false, properties: None, items: None }),
]);
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
registry.register(
ToolDefinition::new("read_sql")
.description("Parse and analyze a SQL file. Extracts CREATE TABLE statements (with columns and types), CREATE VIEW, CREATE INDEX, CREATE FUNCTION, and counts all statement types.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_sql_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}

View File

@ -0,0 +1,184 @@
//! read_word — parse and extract text from Word documents (.docx) via zip+xml.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use futures::FutureExt;
use quick_xml::events::Event;
use quick_xml::Reader;
use std::collections::HashMap;
use zip::ZipArchive;
/// Extract paragraph text from a Word document (.docx) stored in git.
///
/// JSON arguments:
/// - `project_name`, `repo_name`, `path` — required repository coordinates.
/// - `rev` — git revision to read from (default "HEAD").
/// - `offset` / `limit` — paragraph pagination (defaults 0 / 200).
/// - `sections_only` — keep only heading-like paragraphs.
///
/// Returns the total paragraph count plus the selected page of paragraphs as
/// `{ index, text }` objects. Errors on oversized files, non-ZIP input, or a
/// docx missing its document part.
async fn read_word_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
    let limit = p
        .get("limit")
        .and_then(|v| v.as_u64())
        .unwrap_or(200) as usize;
    let sections_only = p
        .get("sections_only")
        .and_then(|v| v.as_bool())
        .unwrap_or(false);
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // A full 40-hex OID is used verbatim; shorter strings are resolved as a
    // prefix / ref name by the repository layer.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }
    // DOCX is a ZIP archive. Read word/document.xml from it.
    let cursor = std::io::Cursor::new(data);
    let mut archive = ZipArchive::new(cursor).map_err(|e| {
        format!(
            "failed to open docx as ZIP archive: {}. Make sure the file is a valid .docx document.",
            e
        )
    })?;
    let doc_xml = {
        // Resolve the document part's name before opening it. The previous
        // `if let Ok(f) = archive.by_name(..) { f } else { archive.by_name(..) }`
        // shape needs two overlapping `&mut archive` borrows (the first
        // ZipFile escapes the if-let), which the borrow checker rejects.
        let entry_name = if archive.file_names().any(|n| n == "word/document.xml") {
            "word/document.xml"
        } else if archive.file_names().any(|n| n == "document.xml") {
            "document.xml"
        } else {
            return Err(
                "docx archive does not contain word/document.xml or document.xml".to_string(),
            );
        };
        let file = archive
            .by_name(entry_name)
            .map_err(|e| format!("failed to open {}: {}", entry_name, e))?;
        let mut s = String::new();
        let mut reader = std::io::BufReader::new(file);
        std::io::Read::read_to_string(&mut reader, &mut s)
            .map_err(|e| format!("failed to read document.xml: {}", e))?;
        s
    };
    // Collect paragraph text: everything between <w:p> and </w:p>, with all
    // nested run markup flattened into one string per paragraph.
    let mut reader = Reader::from_str(&doc_xml);
    reader.config_mut().trim_text(false);
    let mut paragraphs: Vec<String> = Vec::new();
    let mut buf = Vec::new();
    let mut in_paragraph = false;
    let mut current_text = String::new();
    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                if e.name().as_ref() == b"w:p" {
                    in_paragraph = true;
                    current_text.clear();
                }
            }
            Ok(Event::Text(e)) => {
                if in_paragraph {
                    // Best-effort: an unescapable entity contributes nothing
                    // rather than aborting the whole extraction.
                    let txt = e.unescape().map(|s| s.into_owned()).unwrap_or_default();
                    current_text.push_str(&txt);
                }
            }
            Ok(Event::End(e)) => {
                if e.name().as_ref() == b"w:p" && in_paragraph {
                    in_paragraph = false;
                    let text = current_text.trim().to_string();
                    if !text.is_empty() {
                        paragraphs.push(text);
                    }
                }
            }
            Ok(Event::Eof) => break,
            _ => {}
        }
        buf.clear();
    }
    let total = paragraphs.len();
    let body: Vec<serde_json::Value> = if sections_only {
        // Heading heuristic: starts uppercase, few spaces relative to length,
        // and short. NOTE(review): `len()` is bytes while the space count is
        // chars, so this is approximate for non-ASCII text — confirm intent.
        paragraphs
            .iter()
            .enumerate()
            .filter(|(_, text)| {
                text.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
                    && text.chars().filter(|&c| c == ' ').count() < text.len() / 2
                    && text.len() < 200
            })
            .skip(offset)
            .take(limit)
            .map(|(i, t)| serde_json::json!({ "index": i, "text": t }))
            .collect()
    } else {
        // `index` reflects the paragraph's position in the whole document,
        // not within the returned page.
        paragraphs
            .iter()
            .skip(offset)
            .take(limit)
            .enumerate()
            .map(|(i, t)| serde_json::json!({ "index": offset + i, "text": t }))
            .collect()
    };
    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "paragraph_count": total,
        "paragraphs": body,
    }))
}
pub fn register_word_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the .docx document".into()), required: true, properties: None, items: None }),
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
("sections_only".into(), ToolParam { name: "sections_only".into(), param_type: "boolean".into(), description: Some("If true, extract only section/heading-like paragraphs (short lines starting with uppercase)".into()), required: false, properties: None, items: None }),
("offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of paragraphs to skip (default: 0)".into()), required: false, properties: None, items: None }),
("limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum paragraphs to return (default: 200)".into()), required: false, properties: None, items: None }),
]);
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
registry.register(
ToolDefinition::new("read_word")
.description("Parse and extract text from Word documents (.docx). Returns paragraphs with index and text content. Supports pagination.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_word_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}

View File

@ -91,6 +91,38 @@ async fn git_file_history_exec(ctx: GitToolCtx, args: serde_json::Value) -> Resu
Ok(serde_json::to_value(result).map_err(|e| e.to_string())?) Ok(serde_json::to_value(result).map_err(|e| e.to_string())?)
} }
/// Fetch the raw text content of one file (blob) at a revision.
/// Binary blobs (per the repository's binary detection) are rejected with an
/// error instead of being returned.
async fn git_blob_get_exec(ctx: GitToolCtx, args: serde_json::Value) -> Result<serde_json::Value, String> {
    let params: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    // Shared accessor for string-typed arguments.
    let str_arg = |key: &str| params.get(key).and_then(|v| v.as_str());
    let project_name = str_arg("project_name").ok_or("missing project_name")?;
    let repo_name = str_arg("repo_name").ok_or("missing repo_name")?;
    let path = str_arg("path").ok_or("missing path")?;
    let rev = str_arg("rev").unwrap_or("HEAD").to_string();
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // Full-length (40+ char) revisions are treated as literal OIDs; anything
    // shorter goes through prefix/ref resolution.
    let oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain.commit_get_prefix(&rev).map_err(|e| e.to_string())?.oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&oid, path)
        .map_err(|e| e.to_string())?;
    let blob_info = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
    if blob_info.is_binary {
        return Err(format!("file '{}' is binary, cannot return as text", path));
    }
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    // Lossy decode keeps the tool usable on files with stray invalid bytes.
    let text = String::from_utf8_lossy(&content.content).to_string();
    Ok(serde_json::json!({
        "path": path,
        "oid": entry.oid.to_string(),
        "size": blob_info.size,
        "content": text,
    }))
}
fn flatten_commit(c: &git::commit::types::CommitMeta) -> serde_json::Value { fn flatten_commit(c: &git::commit::types::CommitMeta) -> serde_json::Value {
use chrono::TimeZone; use chrono::TimeZone;
let ts = c.author.time_secs + (c.author.offset_minutes as i64 * 60); let ts = c.author.time_secs + (c.author.offset_minutes as i64 * 60);
@ -162,4 +194,22 @@ pub fn register_git_tools(registry: &mut ToolRegistry) {
}) })
}), }),
); );
// git_blob_get
let p = HashMap::from([
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path within the repository".into()), required: true, properties: None, items: None }),
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Revision to read file from (default: HEAD)".into()), required: false, properties: None, items: None }),
]);
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
registry.register(
ToolDefinition::new("git_blob_get").description("Retrieve the raw content of a single file (blob) at a given revision. Returns error if the file is binary.").parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = super::ctx::GitToolCtx::new(ctx);
Box::pin(async move {
git_blob_get_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
} }

View File

@ -148,6 +148,7 @@ impl AppService {
let client = async_openai::Client::with_config(cfg); let client = async_openai::Client::with_config(cfg);
let mut registry = ToolRegistry::new(); let mut registry = ToolRegistry::new();
git_tools::register_all(&mut registry); git_tools::register_all(&mut registry);
file_tools::register_all(&mut registry);
Some(Arc::new(ChatService::new(client).with_tool_registry(registry))) Some(Arc::new(ChatService::new(client).with_tool_registry(registry)))
} }
(Err(e), _) => { (Err(e), _) => {
@ -229,6 +230,7 @@ pub mod auth;
pub mod error; pub mod error;
pub mod git; pub mod git;
pub mod git_tools; pub mod git_tools;
pub mod file_tools;
pub mod issue; pub mod issue;
pub mod project; pub mod project;
pub mod pull_request; pub mod pull_request;