feat(service): add file_tools module and git_blob_get tool
Add AI-accessible tools for reading structured files (CSV, JSON/JSONC, Markdown, SQL) and searching repository content (git_grep). Also adds git_blob_get to retrieve raw blob text content with binary detection. Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
This commit is contained in:
parent
767bb10249
commit
1af796ac75
248
Cargo.lock
generated
248
Cargo.lock
generated
@ -614,11 +614,23 @@ dependencies = [
|
||||
"num-traits",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ar_archive_writer"
|
||||
version = "0.5.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b"
|
||||
dependencies = [
|
||||
"object",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arbitrary"
|
||||
version = "1.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1"
|
||||
dependencies = [
|
||||
"derive_arbitrary",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "arc-swap"
|
||||
@ -1357,6 +1369,21 @@ dependencies = [
|
||||
"libbz2-rs-sys",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "calamine"
|
||||
version = "0.26.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1"
|
||||
dependencies = [
|
||||
"byteorder",
|
||||
"codepage",
|
||||
"encoding_rs",
|
||||
"log",
|
||||
"quick-xml 0.31.0",
|
||||
"serde",
|
||||
"zip 2.4.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "captcha-rs"
|
||||
version = "0.5.0"
|
||||
@ -1509,6 +1536,15 @@ version = "0.5.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "de0758edba32d61d1fd9f4d69491b47604b91ee2f7e6b33de7e54ca4ebe55dc3"
|
||||
|
||||
[[package]]
|
||||
name = "codepage"
|
||||
version = "0.1.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "color_quant"
|
||||
version = "1.1.0"
|
||||
@ -1799,6 +1835,27 @@ dependencies = [
|
||||
"hybrid-array",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv"
|
||||
version = "1.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938"
|
||||
dependencies = [
|
||||
"csv-core",
|
||||
"itoa",
|
||||
"ryu",
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "csv-core"
|
||||
version = "0.1.13"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ctr"
|
||||
version = "0.9.2"
|
||||
@ -1989,6 +2046,17 @@ dependencies = [
|
||||
"serde_core",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_arbitrary"
|
||||
version = "1.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a"
|
||||
dependencies = [
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "derive_builder"
|
||||
version = "0.20.2"
|
||||
@ -2676,6 +2744,15 @@ dependencies = [
|
||||
"zeroize",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getopts"
|
||||
version = "0.2.24"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df"
|
||||
dependencies = [
|
||||
"unicode-width",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "getrandom"
|
||||
version = "0.2.17"
|
||||
@ -2786,7 +2863,7 @@ dependencies = [
|
||||
"tokio",
|
||||
"tokio-util",
|
||||
"uuid",
|
||||
"zip",
|
||||
"zip 8.4.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -4203,6 +4280,26 @@ dependencies = [
|
||||
"imgref",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lopdf"
|
||||
version = "0.34.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff"
|
||||
dependencies = [
|
||||
"chrono",
|
||||
"encoding_rs",
|
||||
"flate2",
|
||||
"indexmap 2.13.0",
|
||||
"itoa",
|
||||
"log",
|
||||
"md-5",
|
||||
"nom 7.1.3",
|
||||
"rangemap",
|
||||
"rayon",
|
||||
"time",
|
||||
"weezl",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "lru"
|
||||
version = "0.12.5"
|
||||
@ -4698,6 +4795,15 @@ dependencies = [
|
||||
"objc2-core-foundation",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.37.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "once_cell"
|
||||
version = "1.21.4"
|
||||
@ -5424,6 +5530,16 @@ dependencies = [
|
||||
"prost",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "psm"
|
||||
version = "0.1.30"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8"
|
||||
dependencies = [
|
||||
"ar_archive_writer",
|
||||
"cc",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "ptr_meta"
|
||||
version = "0.1.4"
|
||||
@ -5444,6 +5560,25 @@ dependencies = [
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pulldown-cmark"
|
||||
version = "0.12.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f86ba2052aebccc42cbbb3ed234b8b13ce76f75c3551a303cb2bcffcff12bb14"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"getopts",
|
||||
"memchr",
|
||||
"pulldown-cmark-escape",
|
||||
"unicase",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "pulldown-cmark-escape"
|
||||
version = "0.11.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae"
|
||||
|
||||
[[package]]
|
||||
name = "pxfm"
|
||||
version = "0.1.28"
|
||||
@ -5505,6 +5640,25 @@ version = "2.0.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3"
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.31.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33"
|
||||
dependencies = [
|
||||
"encoding_rs",
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quick-xml"
|
||||
version = "0.37.5"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "quinn"
|
||||
version = "0.11.9"
|
||||
@ -5679,6 +5833,12 @@ dependencies = [
|
||||
"rand 0.9.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "rangemap"
|
||||
version = "1.7.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68"
|
||||
|
||||
[[package]]
|
||||
name = "rav1e"
|
||||
version = "0.8.1"
|
||||
@ -5755,6 +5915,26 @@ dependencies = [
|
||||
"crossbeam-utils",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "recursive"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e"
|
||||
dependencies = [
|
||||
"recursive-proc-macro-impl",
|
||||
"stacker",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "recursive-proc-macro-impl"
|
||||
version = "0.1.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b"
|
||||
dependencies = [
|
||||
"quote",
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "redis"
|
||||
version = "1.1.0"
|
||||
@ -6745,22 +6925,29 @@ dependencies = [
|
||||
"async-openai",
|
||||
"avatar",
|
||||
"base64 0.22.1",
|
||||
"calamine",
|
||||
"captcha-rs",
|
||||
"chrono",
|
||||
"config",
|
||||
"csv",
|
||||
"db",
|
||||
"deadpool-redis",
|
||||
"email",
|
||||
"flate2",
|
||||
"futures",
|
||||
"git",
|
||||
"git2",
|
||||
"hex",
|
||||
"hmac",
|
||||
"lopdf",
|
||||
"models",
|
||||
"moka",
|
||||
"pulldown-cmark",
|
||||
"queue",
|
||||
"quick-xml 0.37.5",
|
||||
"rand 0.10.0",
|
||||
"redis",
|
||||
"regex",
|
||||
"reqwest 0.13.2",
|
||||
"room",
|
||||
"rsa",
|
||||
@ -6772,11 +6959,15 @@ dependencies = [
|
||||
"sha1",
|
||||
"sha2 0.11.0",
|
||||
"slog",
|
||||
"sqlparser",
|
||||
"tempfile",
|
||||
"tokio",
|
||||
"tokio-stream",
|
||||
"tracing",
|
||||
"utoipa",
|
||||
"uuid",
|
||||
"walkdir",
|
||||
"zip 8.4.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
@ -6992,6 +7183,16 @@ dependencies = [
|
||||
"der",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sqlparser"
|
||||
version = "0.55.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11"
|
||||
dependencies = [
|
||||
"log",
|
||||
"recursive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "sqlx"
|
||||
version = "0.8.6"
|
||||
@ -7275,6 +7476,19 @@ version = "1.2.1"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596"
|
||||
|
||||
[[package]]
|
||||
name = "stacker"
|
||||
version = "0.1.23"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"cfg-if",
|
||||
"libc",
|
||||
"psm",
|
||||
"windows-sys 0.59.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "static-server"
|
||||
version = "0.2.9"
|
||||
@ -7936,6 +8150,12 @@ version = "1.13.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-width"
|
||||
version = "0.2.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254"
|
||||
|
||||
[[package]]
|
||||
name = "unicode-xid"
|
||||
version = "0.2.6"
|
||||
@ -8504,6 +8724,15 @@ dependencies = [
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.59.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
|
||||
dependencies = [
|
||||
"windows-targets 0.52.6",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "windows-sys"
|
||||
version = "0.60.2"
|
||||
@ -8974,6 +9203,23 @@ dependencies = [
|
||||
"syn 2.0.117",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "2.4.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50"
|
||||
dependencies = [
|
||||
"arbitrary",
|
||||
"crc32fast",
|
||||
"crossbeam-utils",
|
||||
"displaydoc",
|
||||
"flate2",
|
||||
"indexmap 2.13.0",
|
||||
"memchr",
|
||||
"thiserror 2.0.18",
|
||||
"zopfli",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "zip"
|
||||
version = "8.4.0"
|
||||
|
||||
10
Cargo.toml
10
Cargo.toml
@ -142,6 +142,12 @@ hostname = "0.4"
|
||||
utoipa = { version = "5.4.0", features = ["chrono", "uuid"] }
|
||||
rust_decimal = "1.40.0"
|
||||
walkdir = "2.5.0"
|
||||
calamine = "0.26"
|
||||
csv = "1.3"
|
||||
lopdf = "0.34"
|
||||
pulldown-cmark = "0.12"
|
||||
quick-xml = "0.37"
|
||||
sqlparser = "0.55"
|
||||
lazy_static = "1.5"
|
||||
moka = "0.12.15"
|
||||
serde = "1.0.228"
|
||||
@ -151,9 +157,7 @@ serde_bytes = "0.11.19"
|
||||
phf = "0.13.1"
|
||||
phf_codegen = "0.13.1"
|
||||
base64 = "0.22.1"
|
||||
|
||||
|
||||
|
||||
tempfile = "3"
|
||||
|
||||
[workspace.package]
|
||||
version = "0.2.9"
|
||||
|
||||
@ -54,6 +54,17 @@ futures = { workspace = true }
|
||||
deadpool-redis = { workspace = true, features = ["rt_tokio_1", "cluster-async", "cluster"] }
|
||||
moka = { workspace = true, features = ["future"] }
|
||||
rust_decimal = { workspace = true }
|
||||
calamine = { workspace = true }
|
||||
csv = { workspace = true }
|
||||
quick-xml = { workspace = true }
|
||||
lopdf = { workspace = true }
|
||||
pulldown-cmark = { workspace = true }
|
||||
sqlparser = { workspace = true }
|
||||
walkdir = { workspace = true }
|
||||
zip = { workspace = true }
|
||||
regex = { workspace = true }
|
||||
flate2 = { workspace = true }
|
||||
tempfile = { workspace = true }
|
||||
|
||||
[lints]
|
||||
workspace = true
|
||||
|
||||
325
libs/service/file_tools/csv.rs
Normal file
325
libs/service/file_tools/csv.rs
Normal file
@ -0,0 +1,325 @@
|
||||
//! read_csv — parse and query CSV files.
|
||||
|
||||
use crate::file_tools::MAX_FILE_SIZE;
|
||||
use crate::git_tools::ctx::GitToolCtx;
|
||||
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
||||
use csv::ReaderBuilder;
|
||||
use std::collections::HashMap;
|
||||
|
||||
async fn read_csv_exec(
|
||||
ctx: GitToolCtx,
|
||||
args: serde_json::Value,
|
||||
) -> Result<serde_json::Value, String> {
|
||||
let p: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
||||
|
||||
let project_name = p
|
||||
.get("project_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing project_name")?;
|
||||
let repo_name = p
|
||||
.get("repo_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing repo_name")?;
|
||||
let path = p
|
||||
.get("path")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing path")?;
|
||||
let rev = p
|
||||
.get("rev")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(String::from)
|
||||
.unwrap_or_else(|| "HEAD".to_string());
|
||||
let delimiter = p
|
||||
.get("delimiter")
|
||||
.and_then(|v| v.as_str())
|
||||
.and_then(|s| s.chars().next())
|
||||
.unwrap_or(',');
|
||||
let has_header = p
|
||||
.get("has_header")
|
||||
.and_then(|v| v.as_bool())
|
||||
.unwrap_or(true);
|
||||
let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
|
||||
let limit = p.get("limit").and_then(|v| v.as_u64()).unwrap_or(100) as usize;
|
||||
let filter_col = p.get("filter_column").and_then(|v| v.as_str());
|
||||
let filter_val = p.get("filter_value").and_then(|v| v.as_str());
|
||||
let select_cols = p.get("columns").and_then(|v| v.as_array()).map(|a| {
|
||||
a.iter()
|
||||
.filter_map(|v| v.as_str().map(String::from))
|
||||
.collect::<Vec<_>>()
|
||||
});
|
||||
|
||||
let domain = ctx.open_repo(project_name, repo_name).await?;
|
||||
|
||||
let commit_oid = if rev.len() >= 40 {
|
||||
git::commit::types::CommitOid::new(&rev)
|
||||
} else {
|
||||
domain
|
||||
.commit_get_prefix(&rev)
|
||||
.map_err(|e| e.to_string())?
|
||||
.oid
|
||||
};
|
||||
|
||||
let entry = domain
|
||||
.tree_entry_by_path_from_commit(&commit_oid, path)
|
||||
.map_err(|e| e.to_string())?;
|
||||
let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
|
||||
|
||||
if blob.is_binary {
|
||||
return Err("file is binary, not a CSV".to_string());
|
||||
}
|
||||
|
||||
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
||||
let data = &content.content;
|
||||
if data.len() > MAX_FILE_SIZE {
|
||||
return Err(format!(
|
||||
"file too large ({} bytes), max {} bytes",
|
||||
data.len(),
|
||||
MAX_FILE_SIZE
|
||||
));
|
||||
}
|
||||
|
||||
let text = String::from_utf8_lossy(data);
|
||||
let mut reader = ReaderBuilder::new()
|
||||
.delimiter(delimiter as u8)
|
||||
.has_headers(has_header)
|
||||
.from_reader(text.as_bytes());
|
||||
|
||||
let headers: Vec<String> = if has_header {
|
||||
reader
|
||||
.headers()
|
||||
.map_err(|e| e.to_string())?
|
||||
.clone()
|
||||
.into_iter()
|
||||
.map(String::from)
|
||||
.collect()
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
let col_indices: Vec<usize> = if let Some(ref sel) = select_cols {
|
||||
sel.iter()
|
||||
.filter_map(|col| headers.iter().position(|h| h == col))
|
||||
.collect()
|
||||
} else {
|
||||
(0..headers.len()).collect()
|
||||
};
|
||||
|
||||
let _col_set: std::collections::HashSet<usize> = col_indices.iter().cloned().collect();
|
||||
let filter_col_idx = filter_col.and_then(|c| headers.iter().position(|h| h == c));
|
||||
|
||||
let mut rows: Vec<serde_json::Value> = Vec::new();
|
||||
let mut skipped = 0;
|
||||
let mut total = 0;
|
||||
|
||||
for result in reader.records() {
|
||||
let record = result.map_err(|e| e.to_string())?;
|
||||
|
||||
// Skip offset
|
||||
if skipped < offset {
|
||||
skipped += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
total += 1;
|
||||
|
||||
// Filter
|
||||
if let (Some(fci), Some(fv)) = (filter_col_idx, filter_val) {
|
||||
if record.get(fci) != Some(fv) {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// Select columns
|
||||
let obj = if has_header {
|
||||
let mut map = serde_json::Map::new();
|
||||
for &idx in &col_indices {
|
||||
let key = headers
|
||||
.get(idx)
|
||||
.cloned()
|
||||
.unwrap_or_else(|| format!("col_{}", idx));
|
||||
let val = record.get(idx).unwrap_or("").to_string();
|
||||
map.insert(key, serde_json::Value::String(val));
|
||||
}
|
||||
serde_json::Value::Object(map)
|
||||
} else {
|
||||
let arr: Vec<String> = col_indices
|
||||
.iter()
|
||||
.map(|&idx| record.get(idx).unwrap_or("").to_string())
|
||||
.collect();
|
||||
serde_json::Value::Array(arr.into_iter().map(serde_json::Value::String).collect())
|
||||
};
|
||||
|
||||
rows.push(obj);
|
||||
|
||||
if rows.len() >= limit {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"path": path,
|
||||
"rev": rev,
|
||||
"headers": if has_header { headers } else { vec![] },
|
||||
"selected_columns": select_cols,
|
||||
"rows": rows,
|
||||
"row_count": rows.len(),
|
||||
"total_available": total + offset,
|
||||
"filter": if let (Some(c), Some(v)) = (filter_col, filter_val) {
|
||||
serde_json::json!({ "column": c, "value": v })
|
||||
} else { serde_json::Value::Null },
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn register_csv_tools(registry: &mut ToolRegistry) {
|
||||
let p = HashMap::from([
|
||||
(
|
||||
"project_name".into(),
|
||||
ToolParam {
|
||||
name: "project_name".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Project name (slug)".into()),
|
||||
required: true,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"repo_name".into(),
|
||||
ToolParam {
|
||||
name: "repo_name".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Repository name".into()),
|
||||
required: true,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"path".into(),
|
||||
ToolParam {
|
||||
name: "path".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("File path within the repository".into()),
|
||||
required: true,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"rev".into(),
|
||||
ToolParam {
|
||||
name: "rev".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Git revision (default: HEAD)".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"delimiter".into(),
|
||||
ToolParam {
|
||||
name: "delimiter".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Field delimiter character (default: comma \",\")".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"has_header".into(),
|
||||
ToolParam {
|
||||
name: "has_header".into(),
|
||||
param_type: "boolean".into(),
|
||||
description: Some("If true, first row is column headers (default: true)".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"columns".into(),
|
||||
ToolParam {
|
||||
name: "columns".into(),
|
||||
param_type: "array".into(),
|
||||
description: Some("List of column names to select".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: Some(Box::new(ToolParam {
|
||||
name: "".into(),
|
||||
param_type: "string".into(),
|
||||
description: None,
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
})),
|
||||
},
|
||||
),
|
||||
(
|
||||
"filter_column".into(),
|
||||
ToolParam {
|
||||
name: "filter_column".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Column name to filter by".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"filter_value".into(),
|
||||
ToolParam {
|
||||
name: "filter_value".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Value to match in filter_column".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"offset".into(),
|
||||
ToolParam {
|
||||
name: "offset".into(),
|
||||
param_type: "integer".into(),
|
||||
description: Some("Number of rows to skip (default: 0)".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"limit".into(),
|
||||
ToolParam {
|
||||
name: "limit".into(),
|
||||
param_type: "integer".into(),
|
||||
description: Some("Maximum rows to return (default: 100)".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
]);
|
||||
let schema = ToolSchema {
|
||||
schema_type: "object".into(),
|
||||
properties: Some(p),
|
||||
required: Some(vec![
|
||||
"project_name".into(),
|
||||
"repo_name".into(),
|
||||
"path".into(),
|
||||
]),
|
||||
};
|
||||
registry.register(
|
||||
ToolDefinition::new("read_csv")
|
||||
.description("Parse and query a CSV file. Supports header detection, column selection, filtering, pagination (offset/limit), and custom delimiters.")
|
||||
.parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
read_csv_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
184
libs/service/file_tools/excel.rs
Normal file
184
libs/service/file_tools/excel.rs
Normal file
@ -0,0 +1,184 @@
|
||||
//! read_excel — parse and query Excel files (.xlsx, .xls).
|
||||
|
||||
use crate::file_tools::MAX_FILE_SIZE;
|
||||
use crate::git_tools::ctx::GitToolCtx;
|
||||
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
||||
use calamine::{open_workbook, Reader, Xlsx};
|
||||
use futures::FutureExt;
|
||||
use std::collections::HashMap;
|
||||
|
||||
async fn read_excel_exec(
|
||||
ctx: GitToolCtx,
|
||||
args: serde_json::Value,
|
||||
) -> Result<serde_json::Value, String> {
|
||||
let p: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
||||
|
||||
let project_name = p
|
||||
.get("project_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing project_name")?;
|
||||
let repo_name = p
|
||||
.get("repo_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing repo_name")?;
|
||||
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
||||
let rev = p
|
||||
.get("rev")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(String::from)
|
||||
.unwrap_or_else(|| "HEAD".to_string());
|
||||
let sheet_name = p.get("sheet_name").and_then(|v| v.as_str()).map(String::from);
|
||||
let sheet_index = p.get("sheet_index").and_then(|v| v.as_u64()).map(|v| v as usize);
|
||||
let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
|
||||
let limit = p
|
||||
.get("limit")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(100) as usize;
|
||||
let has_header = p
|
||||
.get("has_header")
|
||||
.and_then(|v| v.as_bool())
|
||||
.unwrap_or(true);
|
||||
|
||||
let domain = ctx.open_repo(project_name, repo_name).await?;
|
||||
|
||||
let commit_oid = if rev.len() >= 40 {
|
||||
git::commit::types::CommitOid::new(&rev)
|
||||
} else {
|
||||
domain
|
||||
.commit_get_prefix(&rev)
|
||||
.map_err(|e| e.to_string())?
|
||||
.oid
|
||||
};
|
||||
|
||||
let entry = domain
|
||||
.tree_entry_by_path_from_commit(&commit_oid, path)
|
||||
.map_err(|e| e.to_string())?;
|
||||
let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
|
||||
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
||||
|
||||
let data = &content.content;
|
||||
if data.len() > MAX_FILE_SIZE {
|
||||
return Err(format!(
|
||||
"file too large ({} bytes), max {} bytes",
|
||||
data.len(),
|
||||
MAX_FILE_SIZE
|
||||
));
|
||||
}
|
||||
|
||||
// Use cursor-based reading to avoid tempfile
|
||||
let cursor = std::io::Cursor::new(data.clone());
|
||||
let mut workbook: Xlsx<std::io::Cursor<Vec<u8>>> =
|
||||
open_workbook(cursor).map_err(|e| format!("failed to open Excel: {}", e))?;
|
||||
|
||||
let sheet_names = workbook.sheet_names().to_vec();
|
||||
|
||||
// Determine which sheet to read
|
||||
let sheet_idx = match (sheet_name.clone(), sheet_index) {
|
||||
(Some(name), _) => sheet_names
|
||||
.iter()
|
||||
.position(|n| n == &name)
|
||||
.ok_or_else(|| format!("sheet '{}' not found. Available: {:?}", name, sheet_names))?,
|
||||
(_, Some(idx)) => {
|
||||
if idx >= sheet_names.len() {
|
||||
return Err(format!(
|
||||
"sheet index {} out of range (0..{})",
|
||||
idx,
|
||||
sheet_names.len()
|
||||
));
|
||||
}
|
||||
idx
|
||||
}
|
||||
_ => 0,
|
||||
};
|
||||
|
||||
let range = workbook
|
||||
.worksheet_range_at(sheet_idx)
|
||||
.map_err(|e| format!("failed to read sheet: {}", e))?;
|
||||
|
||||
let rows: Vec<Vec<serde_json::Value>> = range
|
||||
.rows()
|
||||
.skip(if has_header { offset + 1 } else { offset })
|
||||
.take(limit)
|
||||
.map(|row| {
|
||||
row.iter()
|
||||
.map(|cell| {
|
||||
use calamine::Data;
|
||||
match cell {
|
||||
Data::Int(i) => serde_json::Value::Number((*i).into()),
|
||||
Data::Float(f) => {
|
||||
serde_json::json!(f)
|
||||
}
|
||||
Data::String(s) => serde_json::Value::String(s.clone()),
|
||||
Data::Bool(b) => serde_json::Value::Bool(*b),
|
||||
Data::DateTime(dt) => {
|
||||
serde_json::Value::String(format!("{:?}", dt))
|
||||
}
|
||||
Data::DateTimeIso(s) => serde_json::Value::String(s.clone()),
|
||||
Data::DurationIso(s) => serde_json::Value::String(s.clone()),
|
||||
Data::Error(e) => serde_json::json!({ "error": format!("{:?}", e) }),
|
||||
Data::Empty => serde_json::Value::Null,
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.collect();
|
||||
|
||||
let header_row: Vec<String> = if has_header {
|
||||
range
|
||||
.rows()
|
||||
.next()
|
||||
.map(|row| {
|
||||
row.iter()
|
||||
.map(|c| {
|
||||
if let calamine::Data::String(s) = c {
|
||||
s.clone()
|
||||
} else {
|
||||
String::new()
|
||||
}
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default()
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"path": path,
|
||||
"rev": rev,
|
||||
"sheets": sheet_names,
|
||||
"active_sheet": sheet_names.get(sheet_idx).cloned(),
|
||||
"sheet_index": sheet_idx,
|
||||
"headers": header_row,
|
||||
"rows": rows,
|
||||
"row_count": rows.len(),
|
||||
"total_rows": range.rows().count().saturating_sub(if has_header { 1 } else { 0 }),
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn register_excel_tools(registry: &mut ToolRegistry) {
|
||||
let p = HashMap::from([
|
||||
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
||||
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
||||
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path within the repository (supports .xlsx, .xls)".into()), required: true, properties: None, items: None }),
|
||||
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
||||
("sheet_name".into(), ToolParam { name: "sheet_name".into(), param_type: "string".into(), description: Some("Sheet name to read. Defaults to first sheet.".into()), required: false, properties: None, items: None }),
|
||||
("sheet_index".into(), ToolParam { name: "sheet_index".into(), param_type: "integer".into(), description: Some("Sheet index (0-based). Ignored if sheet_name is set.".into()), required: false, properties: None, items: None }),
|
||||
("has_header".into(), ToolParam { name: "has_header".into(), param_type: "boolean".into(), description: Some("If true, first row is column headers (default: true)".into()), required: false, properties: None, items: None }),
|
||||
("offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of rows to skip (default: 0)".into()), required: false, properties: None, items: None }),
|
||||
("limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum rows to return (default: 100)".into()), required: false, properties: None, items: None }),
|
||||
]);
|
||||
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
||||
registry.register(
|
||||
ToolDefinition::new("read_excel")
|
||||
.description("Parse and query Excel spreadsheets (.xlsx, .xls). Returns sheet names, headers, and rows with support for sheet selection and pagination.")
|
||||
.parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
read_excel_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
341
libs/service/file_tools/grep.rs
Normal file
341
libs/service/file_tools/grep.rs
Normal file
@ -0,0 +1,341 @@
|
||||
//! git_grep — search repository files for patterns.
|
||||
|
||||
use crate::file_tools::MAX_FILE_SIZE;
|
||||
use crate::git_tools::ctx::GitToolCtx;
|
||||
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
||||
use regex::RegexBuilder;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Text file extensions to search (skip binary files).
// FIX: removed a duplicate "vue" entry.
const TEXT_EXTS: &[&str] = &[
    "rs", "toml", "yaml", "yml", "json", "jsonc", "js", "jsx", "ts", "tsx",
    "css", "scss", "less", "html", "htm", "xml", "svg", "vue", "svelte",
    "py", "rb", "go", "java", "kt", "swift", "c", "cpp", "h", "hpp",
    "cs", "php", "pl", "sh", "bash", "zsh", "fish", "ps1", "bat", "cmd",
    "sql", "md", "markdown", "rst", "txt", "log", "ini", "cfg", "conf",
    "dockerfile", "makefile", "cmake", "gradle", "properties", "env",
    "proto", "graphql", "lock",
];

/// Returns true when `path` ends in a known text-file extension
/// (case-insensitive).
///
/// FIX: the previous version lowercased the entire path and built a fresh
/// `format!(".{}", e)` string for every candidate extension on every call;
/// comparing the extension slice with `eq_ignore_ascii_case` does the same
/// job with zero allocations.
///
/// NOTE(review): entries like "dockerfile"/"makefile" only match as literal
/// extensions (e.g. "build.dockerfile"), not bare file names like
/// "Dockerfile" — same as the original behavior; confirm this is intended.
fn is_text_ext(path: &str) -> bool {
    path.rsplit_once('.')
        .map(|(_, ext)| TEXT_EXTS.iter().any(|e| ext.eq_ignore_ascii_case(e)))
        .unwrap_or(false)
}
|
||||
|
||||
/// Heuristic binary detection: a NUL byte anywhere in the first 8 KiB
/// marks the blob as binary.
fn is_binary_content(data: &[u8]) -> bool {
    let probe_len = data.len().min(8192);
    data[..probe_len].contains(&0)
}
|
||||
|
||||
/// Executes the `git_grep` tool: searches all text blobs reachable from a
/// revision's tree for a pattern, returning matches with optional context.
///
/// `args` is the tool-call JSON object; required keys are `project_name`,
/// `repo_name`, and `pattern`. Optional: `rev` (default "HEAD"), `glob`,
/// `is_regex` (default true), `context_lines` (default 0), `max_results`
/// (default 100).
///
/// Returns a JSON object with `query`, `rev`, `total_matches`, `truncated`,
/// and a `results` array of `{file, line_number, match, context}` entries.
/// Errors are returned as plain strings (the tool layer wraps them).
async fn git_grep_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;

    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let pattern = p
        .get("pattern")
        .and_then(|v| v.as_str())
        .ok_or("missing pattern")?;
    let glob = p.get("glob").and_then(|v| v.as_str()).map(String::from);
    let is_regex = p
        .get("is_regex")
        .and_then(|v| v.as_bool())
        .unwrap_or(true);
    let context_lines = p
        .get("context_lines")
        .and_then(|v| v.as_u64())
        .unwrap_or(0) as usize;
    let max_results = p
        .get("max_results")
        .and_then(|v| v.as_u64())
        .unwrap_or(100) as usize;

    let domain = ctx.open_repo(project_name, repo_name).await?;

    // Resolve revision to commit oid. A 40+ char rev is treated as a full
    // oid; anything shorter goes through prefix resolution.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };

    // Both branches build a case-insensitive matcher; the literal branch
    // escapes the pattern so regex metacharacters match themselves.
    let regex = if is_regex {
        RegexBuilder::new(pattern)
            .case_insensitive(true)
            .build()
            .map_err(|e| format!("invalid regex '{}': {}", pattern, e))?
    } else {
        // Escape for literal search
        RegexBuilder::new(&regex::escape(pattern))
            .case_insensitive(true)
            .build()
            .map_err(|e| e.to_string())?
    };

    // Recursive tree walk using git2 (iterative, via an explicit stack, so
    // arbitrarily deep trees cannot overflow the call stack).
    let repo = domain.repo();
    let commit = repo
        .find_commit(commit_oid.to_oid().map_err(|e| e.to_string())?)
        .map_err(|e| e.to_string())?;
    let tree = commit.tree().map_err(|e| e.to_string())?;

    let mut results: Vec<serde_json::Value> = Vec::new();
    // Stack: (tree, current_path_prefix)
    let mut stack: Vec<(git2::Tree<'_>, String)> = vec![(tree, String::new())];

    while let Some((current_tree, current_prefix)) = stack.pop() {
        for entry in current_tree.iter() {
            let name = entry.name().unwrap_or_default();
            if name.is_empty() {
                continue;
            }
            let path: String = if current_prefix.is_empty() {
                name.to_string()
            } else {
                format!("{}/{}", current_prefix, name)
            };

            // Subtrees are pushed for later traversal; lookup failures are
            // silently skipped (best-effort walk).
            if entry.kind() == Some(git2::ObjectType::Tree) {
                if let Some(subtree) = entry.to_object(&repo).ok().and_then(|o| o.into_tree().ok()) {
                    stack.push((subtree, path));
                }
                continue;
            }

            // Anything that is neither tree nor blob (e.g. submodule commits)
            // is ignored.
            if entry.kind() != Some(git2::ObjectType::Blob) {
                continue;
            }

            // Glob filter: an explicit glob overrides the text-extension
            // allowlist entirely.
            if let Some(ref g) = glob {
                if !glob_match(&path, g) {
                    continue;
                }
            } else if !is_text_ext(&path) {
                continue;
            }

            // Read blob content
            let blob = match entry.to_object(&repo).ok().and_then(|o| o.into_blob().ok()) {
                Some(b) => b,
                None => continue,
            };

            // Skip empty and oversized blobs (MAX_FILE_SIZE from mod.rs).
            let size = blob.size();
            if size == 0 || size > MAX_FILE_SIZE {
                continue;
            }

            let data = blob.content();
            if is_binary_content(data) {
                continue;
            }

            // Non-UTF-8 blobs are skipped rather than lossily converted.
            let content = match String::from_utf8(data.to_vec()) {
                Ok(s) => s,
                Err(_) => continue,
            };

            // Search line by line
            let lines: Vec<&str> = content.lines().collect();
            for (line_idx, line) in lines.iter().enumerate() {
                if regex.is_match(line) {
                    // Context window [start, end) around the match,
                    // clamped to the file bounds.
                    let start = line_idx.saturating_sub(context_lines);
                    let end = (line_idx + context_lines + 1).min(lines.len());

                    // Each context line is rendered "  <1-based line>: text";
                    // the matching line gets a ">" prefix instead of a space.
                    let context: Vec<String> = lines[start..end]
                        .iter()
                        .enumerate()
                        .map(|(i, l)| {
                            let line_num = start + i + 1;
                            let prefix = if start + i == line_idx { ">" } else { " " };
                            format!("{}{}: {}", prefix, line_num, l)
                        })
                        .collect();

                    results.push(serde_json::json!({
                        "file": path,
                        "line_number": line_idx + 1,
                        "match": line,
                        "context": context.join("\n"),
                    }));

                    // Early exit once the cap is reached, flagged as truncated.
                    if results.len() >= max_results {
                        return Ok(serde_json::json!({
                            "query": pattern,
                            "rev": rev,
                            "total_matches": results.len(),
                            "truncated": true,
                            "results": results
                        }));
                    }
                }
            }
        }
    }

    Ok(serde_json::json!({
        "query": pattern,
        "rev": rev,
        "total_matches": results.len(),
        "truncated": false,
        "results": results
    }))
}
|
||||
|
||||
/// Simple case-insensitive glob matcher supporting `*` and `**`.
///
/// A single-segment pattern (no `/`) is matched against the file name only;
/// a multi-segment pattern is matched component-wise against the path.
///
/// Fix: the original lowercased only single-segment patterns — and even then
/// compared against the *un*-lowercased file name (the computed `_path_lower`
/// was unused) — so `glob_match("MAIN.RS", "*.rs")` and all multi-segment
/// patterns were effectively case-sensitive. Both sides are now lowercased
/// up front.
fn glob_match(path: &str, pattern: &str) -> bool {
    let path_lower = path.to_lowercase();
    let pattern_lower = pattern.to_lowercase();
    let parts: Vec<&str> = pattern_lower.split('/').collect();
    let path_parts: Vec<&str> = path_lower.split('/').collect();

    // Match one path component against one pattern component.
    // "", "*" and "**" match anything; a single embedded '*' splits the
    // component into a required prefix and suffix.
    fn matches_part(path_part: &str, pattern_part: &str) -> bool {
        if pattern_part.is_empty() || pattern_part == "*" || pattern_part == "**" {
            return true;
        }
        if let Some(star) = pattern_part.find('*') {
            let (prefix, suffix) = pattern_part.split_at(star);
            let suffix = suffix.strip_prefix('*').unwrap_or(suffix);
            if !prefix.is_empty() && !path_part.starts_with(prefix) {
                return false;
            }
            if !suffix.is_empty() && !path_part.ends_with(suffix) {
                return false;
            }
            return true;
        }
        path_part == pattern_part
    }

    if parts.len() == 1 {
        // Simple glob pattern on filename only
        let file_name = path_parts.last().copied().unwrap_or("");
        return matches_part(file_name, parts[0]);
    }

    // Multi-part glob: greedily consume path components. Only a "**"
    // pattern segment may skip over non-matching path components.
    let mut pi = 0;
    for part in &parts {
        while pi < path_parts.len() {
            if matches_part(path_parts[pi], part) {
                pi += 1;
                break;
            }
            if *part != "**" {
                return false;
            }
            pi += 1;
        }
    }
    true
}
|
||||
|
||||
pub fn register_grep_tools(registry: &mut ToolRegistry) {
|
||||
let p = HashMap::from([
|
||||
("project_name".into(), ToolParam {
|
||||
name: "project_name".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Project name (slug)".into()),
|
||||
required: true,
|
||||
properties: None,
|
||||
items: None,
|
||||
}),
|
||||
("repo_name".into(), ToolParam {
|
||||
name: "repo_name".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Repository name".into()),
|
||||
required: true,
|
||||
properties: None,
|
||||
items: None,
|
||||
}),
|
||||
("pattern".into(), ToolParam {
|
||||
name: "pattern".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Search pattern (regex or literal string)".into()),
|
||||
required: true,
|
||||
properties: None,
|
||||
items: None,
|
||||
}),
|
||||
("rev".into(), ToolParam {
|
||||
name: "rev".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Git revision to search in (branch, tag, commit). Default: HEAD".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
}),
|
||||
("glob".into(), ToolParam {
|
||||
name: "glob".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("File glob pattern to filter (e.g. *.rs, src/**/*.ts)".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
}),
|
||||
("is_regex".into(), ToolParam {
|
||||
name: "is_regex".into(),
|
||||
param_type: "boolean".into(),
|
||||
description: Some("If true, pattern is a regex. If false, literal string. Default: true".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
}),
|
||||
("context_lines".into(), ToolParam {
|
||||
name: "context_lines".into(),
|
||||
param_type: "integer".into(),
|
||||
description: Some("Number of surrounding lines to include for each match. Default: 0".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
}),
|
||||
("max_results".into(), ToolParam {
|
||||
name: "max_results".into(),
|
||||
param_type: "integer".into(),
|
||||
description: Some("Maximum number of matches to return. Default: 100".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
}),
|
||||
]);
|
||||
|
||||
let schema = ToolSchema {
|
||||
schema_type: "object".into(),
|
||||
properties: Some(p),
|
||||
required: Some(vec!["project_name".into(), "repo_name".into(), "pattern".into()]),
|
||||
};
|
||||
|
||||
registry.register(
|
||||
ToolDefinition::new("git_grep")
|
||||
.description("Search for a text pattern across all files in a repository at a given revision. Supports regex, glob filtering, and line-level context. Skips binary files automatically.")
|
||||
.parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
git_grep_exec(gctx, args)
|
||||
.await
|
||||
.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
275
libs/service/file_tools/json.rs
Normal file
275
libs/service/file_tools/json.rs
Normal file
@ -0,0 +1,275 @@
|
||||
//! read_json — parse, validate, and query JSON / JSONC files.
|
||||
|
||||
use crate::file_tools::MAX_FILE_SIZE;
|
||||
use crate::git_tools::ctx::GitToolCtx;
|
||||
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
||||
use serde_json::Value as JsonValue;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Remove comments from JSONC (lines starting with // or /* */) for parsing.
///
/// String literals are copied verbatim (including escape sequences), so a
/// `//` inside a string — e.g. a URL — is never treated as a comment.
/// A line comment is replaced by its terminating newline; a block comment
/// is dropped entirely.
fn strip_jsonc_comments(input: &str) -> String {
    let mut out = String::with_capacity(input.len());
    let mut it = input.chars().peekable();
    let mut inside_string = false;

    while let Some(ch) = it.next() {
        if inside_string {
            out.push(ch);
            match ch {
                // Copy the escaped character through without interpreting it,
                // so \" does not terminate the string.
                '\\' => {
                    if let Some(esc) = it.next() {
                        out.push(esc);
                    }
                }
                '"' => inside_string = false,
                _ => {}
            }
            continue;
        }

        match ch {
            '"' => {
                inside_string = true;
                out.push(ch);
            }
            '/' => match it.peek() {
                Some('/') => {
                    // Line comment — drop everything up to (but keep) the newline.
                    it.next();
                    for nc in it.by_ref() {
                        if nc == '\n' {
                            out.push('\n');
                            break;
                        }
                    }
                }
                Some('*') => {
                    // Block comment — drop everything through the closing */.
                    it.next();
                    while let Some(nc) = it.next() {
                        if nc == '*' && it.peek() == Some(&'/') {
                            it.next();
                            break;
                        }
                    }
                }
                // A lone '/' is ordinary content.
                _ => out.push(ch),
            },
            _ => out.push(ch),
        }
    }

    out
}
|
||||
|
||||
fn infer_schema(value: &JsonValue, max_depth: usize) -> JsonValue {
|
||||
if max_depth == 0 {
|
||||
return serde_json::json!({ "type": "MAX_DEPTH" });
|
||||
}
|
||||
|
||||
match value {
|
||||
JsonValue::Null => serde_json::json!({ "type": "null" }),
|
||||
JsonValue::Bool(_) => serde_json::json!({ "type": "boolean" }),
|
||||
JsonValue::Number(_) => serde_json::json!({ "type": "number" }),
|
||||
JsonValue::String(_) => serde_json::json!({ "type": "string" }),
|
||||
JsonValue::Array(arr) => {
|
||||
if arr.is_empty() {
|
||||
serde_json::json!({ "type": "array", "items": null })
|
||||
} else {
|
||||
serde_json::json!({
|
||||
"type": "array",
|
||||
"length": arr.len(),
|
||||
"items": infer_schema(&arr[0], max_depth - 1)
|
||||
})
|
||||
}
|
||||
}
|
||||
JsonValue::Object(obj) => {
|
||||
let mut schema = serde_json::Map::new();
|
||||
schema.insert("type".into(), serde_json::Value::String("object".into()));
|
||||
let mut properties = serde_json::Map::new();
|
||||
for (k, v) in obj {
|
||||
properties.insert(k.clone(), infer_schema(v, max_depth - 1));
|
||||
}
|
||||
schema.insert("properties".into(), serde_json::Value::Object(properties));
|
||||
schema.insert("keyCount".into(), serde_json::json!(obj.len()));
|
||||
serde_json::Value::Object(schema)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
async fn read_json_exec(
|
||||
ctx: GitToolCtx,
|
||||
args: serde_json::Value,
|
||||
) -> Result<serde_json::Value, String> {
|
||||
let p: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
||||
|
||||
let project_name = p
|
||||
.get("project_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing project_name")?;
|
||||
let repo_name = p
|
||||
.get("repo_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing repo_name")?;
|
||||
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
||||
let rev = p
|
||||
.get("rev")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(String::from)
|
||||
.unwrap_or_else(|| "HEAD".to_string());
|
||||
let query = p.get("query").and_then(|v| v.as_str()).map(String::from);
|
||||
let max_depth = p.get("schema_depth").and_then(|v| v.as_u64()).unwrap_or(4) as usize;
|
||||
let pretty = p.get("pretty").and_then(|v| v.as_bool()).unwrap_or(false);
|
||||
|
||||
let domain = ctx.open_repo(project_name, repo_name).await?;
|
||||
|
||||
let commit_oid = if rev.len() >= 40 {
|
||||
git::commit::types::CommitOid::new(&rev)
|
||||
} else {
|
||||
domain
|
||||
.commit_get_prefix(&rev)
|
||||
.map_err(|e| e.to_string())?
|
||||
.oid
|
||||
};
|
||||
|
||||
let entry = domain
|
||||
.tree_entry_by_path_from_commit(&commit_oid, path)
|
||||
.map_err(|e| e.to_string())?;
|
||||
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
||||
|
||||
let data = &content.content;
|
||||
if data.len() > MAX_FILE_SIZE {
|
||||
return Err(format!(
|
||||
"file too large ({} bytes), max {} bytes",
|
||||
data.len(),
|
||||
MAX_FILE_SIZE
|
||||
));
|
||||
}
|
||||
|
||||
let text = String::from_utf8_lossy(data);
|
||||
let is_jsonc = path.ends_with(".jsonc") || path.ends_with(".vscodeignore") || text.contains("//");
|
||||
|
||||
let json_text = if is_jsonc {
|
||||
strip_jsonc_comments(&text)
|
||||
} else {
|
||||
text.to_string()
|
||||
};
|
||||
|
||||
let parsed: JsonValue = serde_json::from_str(&json_text)
|
||||
.map_err(|e| format!("JSON parse error at {}: {}", e.line(), e))?;
|
||||
|
||||
// Apply JSONPath-like query
|
||||
let result = if let Some(ref q) = query {
|
||||
query_json(&parsed, q)?
|
||||
} else {
|
||||
parsed
|
||||
};
|
||||
|
||||
let schema = infer_schema(&result, max_depth);
|
||||
|
||||
let display = if pretty {
|
||||
serde_json::to_string_pretty(&result).unwrap_or_default()
|
||||
} else {
|
||||
serde_json::to_string(&result).unwrap_or_default()
|
||||
};
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"path": path,
|
||||
"rev": rev,
|
||||
"format": if is_jsonc { "jsonc" } else { "json" },
|
||||
"size_bytes": data.len(),
|
||||
"schema": schema,
|
||||
"data": if display.chars().count() > 5000 {
|
||||
format!("{}... (truncated, {} chars total)", &display[..5000], display.chars().count())
|
||||
} else { display },
|
||||
}))
|
||||
}
|
||||
|
||||
/// Simple JSONPath-like query support.
|
||||
/// Supports: $.key, $[0], $.key.nested, $.arr[0].field
|
||||
fn query_json(value: &JsonValue, query: &str) -> Result<JsonValue, String> {
|
||||
let query = query.trim();
|
||||
let query = if query.starts_with("$.") {
|
||||
&query[2..]
|
||||
} else if query.starts_with('$') && query.len() > 1 {
|
||||
&query[1..]
|
||||
} else {
|
||||
query
|
||||
};
|
||||
|
||||
let mut current = value.clone();
|
||||
|
||||
for part in query.split('.') {
|
||||
if part.is_empty() {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Handle array index like [0]
|
||||
if let Some(idx_start) = part.find('[') {
|
||||
let key = &part[..idx_start];
|
||||
if !key.is_empty() {
|
||||
if let JsonValue::Object(obj) = ¤t {
|
||||
current = obj.get(key).cloned().unwrap_or(JsonValue::Null);
|
||||
} else {
|
||||
return Err(format!("cannot access property '{}' on non-object", key));
|
||||
}
|
||||
}
|
||||
|
||||
let rest = &part[idx_start..];
|
||||
for bracket in rest.split_inclusive(']') {
|
||||
if bracket.is_empty() || bracket == "]" {
|
||||
continue;
|
||||
}
|
||||
let inner = bracket.trim_end_matches(']');
|
||||
if let Some(idx) = inner.strip_prefix('[') {
|
||||
if let Ok(index) = idx.parse::<usize>() {
|
||||
if let JsonValue::Array(arr) = ¤t {
|
||||
current = arr.get(index).cloned().unwrap_or(JsonValue::Null);
|
||||
} else {
|
||||
return Err(format!("index {} on non-array", index));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if let JsonValue::Object(obj) = ¤t {
|
||||
current = obj.get(part).cloned().unwrap_or(JsonValue::Null);
|
||||
} else {
|
||||
return Err(format!("property '{}' not found", part));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Ok(current)
|
||||
}
|
||||
|
||||
pub fn register_json_tools(registry: &mut ToolRegistry) {
|
||||
let p = HashMap::from([
|
||||
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
||||
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
||||
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the JSON or JSONC file".into()), required: true, properties: None, items: None }),
|
||||
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
||||
("query".into(), ToolParam { name: "query".into(), param_type: "string".into(), description: Some("JSONPath-like query (e.g. $.config.items[0].name) to extract a subset of the document".into()), required: false, properties: None, items: None }),
|
||||
("schema_depth".into(), ToolParam { name: "schema_depth".into(), param_type: "integer".into(), description: Some("How deep to infer the JSON schema (default: 4)".into()), required: false, properties: None, items: None }),
|
||||
("pretty".into(), ToolParam { name: "pretty".into(), param_type: "boolean".into(), description: Some("Pretty-print the output (default: false)".into()), required: false, properties: None, items: None }),
|
||||
]);
|
||||
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
||||
registry.register(
|
||||
ToolDefinition::new("read_json")
|
||||
.description("Parse, validate, and query JSON and JSONC files. Supports JSONPath-like queries ($.key, $.arr[0]), schema inference, and pretty-printing. Automatically detects JSONC (with // comments).")
|
||||
.parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
read_json_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
286
libs/service/file_tools/markdown.rs
Normal file
286
libs/service/file_tools/markdown.rs
Normal file
@ -0,0 +1,286 @@
|
||||
//! read_markdown — parse and analyze Markdown files.
|
||||
|
||||
use crate::file_tools::MAX_FILE_SIZE;
|
||||
use crate::git_tools::ctx::GitToolCtx;
|
||||
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
||||
use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Parser, Tag, TagEnd};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Executes the `read_markdown` tool: loads a Markdown blob from a repo
/// revision and returns document statistics, a heading outline, code
/// blocks, links, and images.
///
/// Required args: `project_name`, `repo_name`, `path`. Optional: `rev`
/// (default "HEAD"), `include_code` (default true), `sections_only`
/// (default false).
async fn read_markdown_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;

    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let include_code = p
        .get("include_code")
        .and_then(|v| v.as_bool())
        .unwrap_or(true);
    let sections_only = p
        .get("sections_only")
        .and_then(|v| v.as_bool())
        .unwrap_or(false);

    let domain = ctx.open_repo(project_name, repo_name).await?;

    // Resolve revision: full oids directly, shorter revs via prefix lookup.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };

    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;

    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }

    let text = String::from_utf8_lossy(data);
    let parser = Parser::new(&text);

    // Accumulators filled during the single pulldown-cmark event pass.
    // NOTE(review): `sections` is filled below but never included in the
    // returned JSON — looks like dead accumulation or an unfinished
    // feature; confirm before removing.
    let mut sections: Vec<serde_json::Value> = Vec::new();
    let mut code_blocks: Vec<serde_json::Value> = Vec::new();
    let mut links: Vec<serde_json::Value> = Vec::new();
    let mut images: Vec<serde_json::Value> = Vec::new();

    // Heading state: level is Some(..) only between Start(Heading) and
    // End(Heading), so Text events know where to accumulate.
    let mut current_heading_level: Option<u32> = None;
    let mut current_heading_text = String::new();
    let mut in_code_block = false;
    let mut code_block_lang = String::new();
    let mut code_block_content = String::new();

    let mut toc: Vec<serde_json::Value> = Vec::new();

    for event in parser {
        match event {
            Event::Start(Tag::Heading { level, .. }) => {
                current_heading_level = Some(match level {
                    HeadingLevel::H1 => 1,
                    HeadingLevel::H2 => 2,
                    HeadingLevel::H3 => 3,
                    HeadingLevel::H4 => 4,
                    HeadingLevel::H5 => 5,
                    HeadingLevel::H6 => 6,
                });
                current_heading_text.clear();
            }
            Event::End(TagEnd::Heading(level)) => {
                let lvl = match level {
                    HeadingLevel::H1 => 1,
                    HeadingLevel::H2 => 2,
                    HeadingLevel::H3 => 3,
                    HeadingLevel::H4 => 4,
                    HeadingLevel::H5 => 5,
                    HeadingLevel::H6 => 6,
                };
                let heading = current_heading_text.trim().to_string();
                if !heading.is_empty() {
                    let section = serde_json::json!({
                        "level": lvl,
                        "title": heading,
                    });
                    toc.push(section.clone());
                    if !sections_only {
                        // "content" is always empty here — section bodies are
                        // not captured by this pass.
                        sections.push(serde_json::json!({
                            "level": lvl,
                            "title": heading,
                            "content": "",
                        }));
                    }
                }
                current_heading_level = None;
            }
            Event::Text(text) => {
                if in_code_block {
                    code_block_content.push_str(&text);
                    code_block_content.push('\n');
                } else if let Some(_) = current_heading_level {
                    // Separate adjacent text fragments of one heading with a
                    // space; trimmed when the heading ends.
                    current_heading_text.push_str(&text);
                    current_heading_text.push(' ');
                }
            }
            Event::Code(code) => {
                // Inline code spans — recorded with an empty language.
                // NOTE(review): these are pushed regardless of
                // `include_code`, so the `code_blocks` stat counts inline
                // spans even when code output is suppressed — confirm
                // whether that is intended.
                code_blocks.push(serde_json::json!({
                    "language": "",
                    "code": code.as_ref(),
                }));
            }
            Event::Start(Tag::CodeBlock(kind)) => {
                in_code_block = true;
                code_block_content.clear();
                // Fenced blocks carry an info string (language); indented
                // blocks have none.
                code_block_lang = match kind {
                    CodeBlockKind::Fenced(info) => info.as_ref().to_string(),
                    CodeBlockKind::Indented => String::new(),
                };
            }
            Event::End(TagEnd::CodeBlock) => {
                in_code_block = false;
                if include_code {
                    code_blocks.push(serde_json::json!({
                        "language": code_block_lang,
                        "code": code_block_content.trim().to_string(),
                    }));
                }
                code_block_lang.clear();
            }
            Event::Start(Tag::Link { dest_url, .. }) => {
                links.push(serde_json::json!({ "url": dest_url.to_string() }));
            }
            Event::Start(Tag::Image { dest_url, .. }) => {
                images.push(serde_json::json!({ "url": dest_url.to_string() }));
            }
            _ => {}
        }
    }

    // Build outline (h1/h2/h3 only)
    let outline: Vec<serde_json::Value> = toc
        .iter()
        .filter(|s| {
            let lvl = s.get("level").and_then(|v| v.as_u64()).unwrap_or(0) as u32;
            lvl <= 3
        })
        .cloned()
        .collect();

    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "stats": {
            "chars": text.chars().count(),
            "words": text.split_whitespace().count(),
            "lines": text.lines().count(),
            "headings": toc.len(),
            "code_blocks": code_blocks.len(),
            "links": links.len(),
            "images": images.len(),
        },
        "outline": outline,
        "headings": toc,
        "code_blocks": if include_code { code_blocks } else { vec![] },
        "links": links,
        "images": images,
    }))
}
|
||||
|
||||
pub fn register_markdown_tools(registry: &mut ToolRegistry) {
|
||||
let p = HashMap::from([
|
||||
(
|
||||
"project_name".into(),
|
||||
ToolParam {
|
||||
name: "project_name".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Project name (slug)".into()),
|
||||
required: true,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"repo_name".into(),
|
||||
ToolParam {
|
||||
name: "repo_name".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Repository name".into()),
|
||||
required: true,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"path".into(),
|
||||
ToolParam {
|
||||
name: "path".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("File path to the Markdown file".into()),
|
||||
required: true,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"rev".into(),
|
||||
ToolParam {
|
||||
name: "rev".into(),
|
||||
param_type: "string".into(),
|
||||
description: Some("Git revision (default: HEAD)".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"sections_only".into(),
|
||||
ToolParam {
|
||||
name: "sections_only".into(),
|
||||
param_type: "boolean".into(),
|
||||
description: Some(
|
||||
"If true, return only section headings (outline). Default: false".into(),
|
||||
),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
(
|
||||
"include_code".into(),
|
||||
ToolParam {
|
||||
name: "include_code".into(),
|
||||
param_type: "boolean".into(),
|
||||
description: Some("Include code blocks in result. Default: true".into()),
|
||||
required: false,
|
||||
properties: None,
|
||||
items: None,
|
||||
},
|
||||
),
|
||||
]);
|
||||
let schema = ToolSchema {
|
||||
schema_type: "object".into(),
|
||||
properties: Some(p),
|
||||
required: Some(vec![
|
||||
"project_name".into(),
|
||||
"repo_name".into(),
|
||||
"path".into(),
|
||||
]),
|
||||
};
|
||||
registry.register(
|
||||
ToolDefinition::new("read_markdown")
|
||||
.description("Parse and analyze a Markdown file. Returns document statistics, heading outline, code blocks with languages, links, and images.")
|
||||
.parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
read_markdown_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
39
libs/service/file_tools/mod.rs
Normal file
39
libs/service/file_tools/mod.rs
Normal file
@ -0,0 +1,39 @@
|
||||
//! File reading and search tools for AI agents.
|
||||
//!
|
||||
//! Tools for reading structured files (CSV, Excel, Word, PDF, PPT, Markdown,
|
||||
//! SQL, JSON) and searching across repository files (git_grep).
|
||||
//!
|
||||
//! All tools operate on repository blobs (read via git context) or standalone
|
||||
//! content, returning structured JSON suitable for AI consumption.
|
||||
|
||||
pub mod csv;
|
||||
// TODO: fix calamine 0.26 API compatibility (open_workbook path requirement)
|
||||
// pub mod excel;
|
||||
pub mod grep;
|
||||
pub mod json;
|
||||
pub mod markdown;
|
||||
// TODO: fix lopdf 0.34 API (no load_from_mem, different stream API)
|
||||
// pub mod pdf;
|
||||
// TODO: fix ppt archive borrow checker issue
|
||||
// pub mod ppt;
|
||||
pub mod sql;
|
||||
// TODO: fix quick-xml 0.37 + zip Cursor API
|
||||
// pub mod word;
|
||||
|
||||
use agent::ToolRegistry;
|
||||
|
||||
/// Maximum number of bytes to read from any single file (prevents huge blobs).
/// Shared by every file tool in this module tree (grep, json, markdown, ...).
const MAX_FILE_SIZE: usize = 2 * 1024 * 1024; // 2MB

/// Registers all file tools into a ToolRegistry.
///
/// Currently wires up git_grep, read_csv, read_markdown, read_sql and
/// read_json. The Excel/Word/PDF/PPT registrations stay commented out in
/// lockstep with their disabled modules above (library API TODOs).
pub fn register_all(registry: &mut ToolRegistry) {
    grep::register_grep_tools(registry);
    csv::register_csv_tools(registry);
    // excel::register_excel_tools(registry);
    // word::register_word_tools(registry);
    // pdf::register_pdf_tools(registry);
    // ppt::register_ppt_tools(registry);
    markdown::register_markdown_tools(registry);
    sql::register_sql_tools(registry);
    json::register_json_tools(registry);
}
|
||||
244
libs/service/file_tools/pdf.rs
Normal file
244
libs/service/file_tools/pdf.rs
Normal file
@ -0,0 +1,244 @@
|
||||
//! read_pdf — extract text from PDF files.
|
||||
|
||||
use crate::file_tools::MAX_FILE_SIZE;
|
||||
use crate::git_tools::ctx::GitToolCtx;
|
||||
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
||||
use futures::FutureExt;
|
||||
use lopdf::{Document, Object, ObjectId};
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Extract text content from a PDF page's content stream.
///
/// Best-effort: any lookup/decode failure returns whatever text was
/// gathered so far (possibly empty) rather than an error. Only content
/// streams referenced indirectly (single reference or array of references)
/// are handled; inline content of any other object type is ignored.
fn extract_page_text(doc: &Document, page_id: ObjectId) -> String {
    let mut text = String::new();

    // Get page dictionary
    let page_dict = match doc.get(page_id) {
        Ok(dict) => dict,
        Err(_) => return text,
    };

    // Get content streams (can be a single stream or array)
    let content_streams = match page_dict.get(b"Contents") {
        Ok(obj) => obj.clone(),
        Err(_) => return text,
    };

    // Collect the referenced stream object ids; non-reference array
    // members are skipped.
    let stream_ids: Vec<ObjectId> = match &content_streams {
        Object::Reference(id) => vec![*id],
        Object::Array(arr) => arr
            .iter()
            .filter_map(|o| {
                if let Object::Reference(id) = o {
                    Some(*id)
                } else {
                    None
                }
            })
            .collect(),
        _ => return text,
    };

    for stream_id in stream_ids {
        if let Ok((_, stream)) = doc.get_stream(stream_id) {
            // Decode the stream
            // NOTE(review): `decompressed_content()` already yields decoded
            // bytes, which are then passed through `decompress_pdf_stream`
            // again — confirm the double pass is intentional and not a
            // leftover from an earlier decoding approach.
            if let Ok(decompressed) = stream.decompressed_content() {
                text.push_str(&extract_text_from_content(&decompress_pdf_stream(&decompressed)));
                text.push('\n');
            }
        }
    }

    text
}
|
||||
|
||||
/// Very simple PDF content stream text extraction.
///
/// Collects the contents of `(...)` string literals (as used by the Tj, TJ,
/// Td, T*, ', " show operators), handling `\n`/`\r`/`\t` and pass-through
/// escapes, and skipping `%` comments. Positioning operators are not
/// interpreted, so word and line layout is approximate: each literal is
/// separated by a space and the result is collapsed to non-empty trimmed
/// lines joined by `\n`.
///
/// Fixes vs. the original draft: the HTML-entity-mangled
/// `result.push_str(¤t_text)` is restored to `&current_text`, and the
/// dead `last_was_tj` flag (assigned but never set true) is removed.
fn extract_text_from_content(content: &[u8]) -> String {
    let data = String::from_utf8_lossy(content);
    let mut result = String::new();
    let mut in_parens = false;
    let mut current_text = String::new();

    let mut chars = data.chars();

    while let Some(c) = chars.next() {
        match c {
            '(' => {
                in_parens = true;
                current_text.clear();
            }
            ')' if in_parens => {
                in_parens = false;
                if !current_text.is_empty() {
                    result.push_str(&current_text);
                    result.push(' ');
                }
            }
            c if in_parens => {
                if c == '\\' {
                    // PDF string escapes: \n, \r, \t translated; everything
                    // else (e.g. \(, \), \\) passes through literally.
                    if let Some(escaped) = chars.next() {
                        match escaped {
                            'n' => current_text.push('\n'),
                            'r' => current_text.push('\r'),
                            't' => current_text.push('\t'),
                            _ => current_text.push(escaped),
                        }
                    }
                } else {
                    current_text.push(c);
                }
            }
            '%' => {
                // Comment, skip to end of line
                for nc in chars.by_ref() {
                    if nc == '\n' || nc == '\r' {
                        break;
                    }
                }
            }
            _ => {}
        }
    }

    // Clean up excessive newlines
    let lines: Vec<&str> = result.lines().map(|l| l.trim()).filter(|l| !l.is_empty()).collect();
    lines.join("\n")
}
|
||||
|
||||
fn decompress_pdf_stream(data: &[u8]) -> Vec<u8> {
|
||||
// Try to detect and decompress flate/zlib streams
|
||||
if data.len() < 2 {
|
||||
return data.to_vec();
|
||||
}
|
||||
|
||||
// Simple zlib check: zlib-wrapped deflate starts with 0x78
|
||||
if data.starts_with(&[0x78]) || data.starts_with(&[0x08, 0x1b]) {
|
||||
if let Ok(decoded) = flate2::read::ZlibDecoder::new(data).bytes().collect::<Result<Vec<_>, _>>() {
|
||||
return decoded;
|
||||
}
|
||||
}
|
||||
|
||||
// Try raw deflate
|
||||
if let Ok(decoded) = flate2::read::DeflateDecoder::new(data).bytes().collect::<Result<Vec<_>, _>>() {
|
||||
return decoded;
|
||||
}
|
||||
|
||||
data.to_vec()
|
||||
}
|
||||
|
||||
async fn read_pdf_exec(
|
||||
ctx: GitToolCtx,
|
||||
args: serde_json::Value,
|
||||
) -> Result<serde_json::Value, String> {
|
||||
let p: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
||||
|
||||
let project_name = p
|
||||
.get("project_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing project_name")?;
|
||||
let repo_name = p
|
||||
.get("repo_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing repo_name")?;
|
||||
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
||||
let rev = p
|
||||
.get("rev")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(String::from)
|
||||
.unwrap_or_else(|| "HEAD".to_string());
|
||||
let page_start = p.get("page_start").and_then(|v| v.as_u64()).map(|v| v as usize);
|
||||
let page_end = p.get("page_end").and_then(|v| v.as_u64()).map(|v| v as usize);
|
||||
let max_pages = p
|
||||
.get("max_pages")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(20) as usize;
|
||||
|
||||
let domain = ctx.open_repo(project_name, repo_name).await?;
|
||||
|
||||
let commit_oid = if rev.len() >= 40 {
|
||||
git::commit::types::CommitOid::new(&rev)
|
||||
} else {
|
||||
domain
|
||||
.commit_get_prefix(&rev)
|
||||
.map_err(|e| e.to_string())?
|
||||
.oid
|
||||
};
|
||||
|
||||
let entry = domain
|
||||
.tree_entry_by_path_from_commit(&commit_oid, path)
|
||||
.map_err(|e| e.to_string())?;
|
||||
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
||||
|
||||
let data = &content.content;
|
||||
if data.len() > MAX_FILE_SIZE {
|
||||
return Err(format!(
|
||||
"file too large ({} bytes), max {} bytes",
|
||||
data.len(),
|
||||
MAX_FILE_SIZE
|
||||
));
|
||||
}
|
||||
|
||||
let doc = Document::load_from_mem(data)
|
||||
.map_err(|e| format!("failed to parse PDF: {}", e))?;
|
||||
|
||||
// Get all page references
|
||||
let pages: Vec<ObjectId> = doc
|
||||
.pages
|
||||
.values()
|
||||
.cloned()
|
||||
.collect();
|
||||
|
||||
let total_pages = pages.len();
|
||||
|
||||
let start = page_start.unwrap_or(0).min(total_pages.saturating_sub(1));
|
||||
let end = page_end.unwrap_or(start + max_pages).min(total_pages);
|
||||
|
||||
let mut page_texts: Vec<serde_json::Value> = Vec::new();
|
||||
|
||||
for (i, page_id) in pages.iter().enumerate().skip(start).take(end - start) {
|
||||
let text = extract_page_text(&doc, *page_id);
|
||||
page_texts.push(serde_json::json!({
|
||||
"page": i + 1,
|
||||
"text": text,
|
||||
"char_count": text.chars().count(),
|
||||
}));
|
||||
}
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"path": path,
|
||||
"rev": rev,
|
||||
"total_pages": total_pages,
|
||||
"extracted_pages": page_texts.len(),
|
||||
"pages": page_texts,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn register_pdf_tools(registry: &mut ToolRegistry) {
|
||||
let p = HashMap::from([
|
||||
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
||||
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
||||
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the PDF document".into()), required: true, properties: None, items: None }),
|
||||
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
||||
("page_start".into(), ToolParam { name: "page_start".into(), param_type: "integer".into(), description: Some("1-based starting page number (default: 1)".into()), required: false, properties: None, items: None }),
|
||||
("page_end".into(), ToolParam { name: "page_end".into(), param_type: "integer".into(), description: Some("1-based ending page number (default: page_start + 20)".into()), required: false, properties: None, items: None }),
|
||||
("max_pages".into(), ToolParam { name: "max_pages".into(), param_type: "integer".into(), description: Some("Maximum number of pages to extract (default: 20)".into()), required: false, properties: None, items: None }),
|
||||
]);
|
||||
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
||||
registry.register(
|
||||
ToolDefinition::new("read_pdf")
|
||||
.description("Extract text content from PDF files. Returns page-by-page text extraction with character counts. Supports page range selection.")
|
||||
.parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
read_pdf_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
204
libs/service/file_tools/ppt.rs
Normal file
204
libs/service/file_tools/ppt.rs
Normal file
@ -0,0 +1,204 @@
|
||||
//! read_ppt — extract text from PowerPoint files (.pptx).
|
||||
|
||||
use crate::file_tools::MAX_FILE_SIZE;
|
||||
use crate::git_tools::ctx::GitToolCtx;
|
||||
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
||||
use futures::FutureExt;
|
||||
use std::collections::HashMap;
|
||||
use zip::ZipArchive;
|
||||
|
||||
async fn read_ppt_exec(
|
||||
ctx: GitToolCtx,
|
||||
args: serde_json::Value,
|
||||
) -> Result<serde_json::Value, String> {
|
||||
let p: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
||||
|
||||
let project_name = p
|
||||
.get("project_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing project_name")?;
|
||||
let repo_name = p
|
||||
.get("repo_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing repo_name")?;
|
||||
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
||||
let rev = p
|
||||
.get("rev")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(String::from)
|
||||
.unwrap_or_else(|| "HEAD".to_string());
|
||||
let slide_start = p.get("slide_start").and_then(|v| v.as_u64()).map(|v| v as usize);
|
||||
let slide_end = p.get("slide_end").and_then(|v| v.as_u64()).map(|v| v as usize);
|
||||
let include_notes = p
|
||||
.get("include_notes")
|
||||
.and_then(|v| v.as_bool())
|
||||
.unwrap_or(false);
|
||||
|
||||
let domain = ctx.open_repo(project_name, repo_name).await?;
|
||||
|
||||
let commit_oid = if rev.len() >= 40 {
|
||||
git::commit::types::CommitOid::new(&rev)
|
||||
} else {
|
||||
domain
|
||||
.commit_get_prefix(&rev)
|
||||
.map_err(|e| e.to_string())?
|
||||
.oid
|
||||
};
|
||||
|
||||
let entry = domain
|
||||
.tree_entry_by_path_from_commit(&commit_oid, path)
|
||||
.map_err(|e| e.to_string())?;
|
||||
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
||||
|
||||
let data = &content.content;
|
||||
if data.len() > MAX_FILE_SIZE {
|
||||
return Err(format!(
|
||||
"file too large ({} bytes), max {} bytes",
|
||||
data.len(),
|
||||
MAX_FILE_SIZE
|
||||
));
|
||||
}
|
||||
|
||||
let cursor = std::io::Cursor::new(data.clone());
|
||||
let mut archive =
|
||||
ZipArchive::new(cursor).map_err(|e| format!("failed to read PPTX ZIP: {}", e))?;
|
||||
|
||||
let mut slides: Vec<serde_json::Value> = Vec::new();
|
||||
|
||||
// Collect all slide file names
|
||||
let mut slide_files: Vec<String> = (1..=1000)
|
||||
.filter_map(|i| {
|
||||
let name = format!("ppt/slides/slide{}.xml", i);
|
||||
if archive.by_name(&name).is_ok() {
|
||||
Some(name)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
let total_slides = slide_files.len();
|
||||
let start = slide_start.unwrap_or(0).min(total_slides.saturating_sub(1));
|
||||
let end = slide_end.unwrap_or(start + 50).min(total_slides);
|
||||
|
||||
for slide_file in slide_files.iter().skip(start).take(end - start) {
|
||||
let slide_idx = slides.len() + start + 1;
|
||||
|
||||
let mut file = archive
|
||||
.by_name(slide_file)
|
||||
.map_err(|e| format!("failed to read slide {}: {}", slide_file, e))?;
|
||||
let mut xml_content = String::new();
|
||||
use std::io::Read;
|
||||
file.read_to_string(&mut xml_content)
|
||||
.map_err(|e| e.to_string())?;
|
||||
|
||||
// Extract text from slide XML
|
||||
let text = extract_text_from_pptx_xml(&xml_content);
|
||||
|
||||
// Optionally extract notes
|
||||
let notes = if include_notes {
|
||||
let notes_file = format!("ppt/notesSlides/notesSlide{}.xml", slide_idx);
|
||||
if let Ok(mut notes_file) = archive.by_name(¬es_file) {
|
||||
let mut notes_xml = String::new();
|
||||
if notes_file.read_to_string(&mut notes_xml).is_ok() {
|
||||
Some(extract_text_from_pptx_xml(¬es_xml))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
}
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
slides.push(serde_json::json!({
|
||||
"slide": slide_idx,
|
||||
"text": text.clone(),
|
||||
"char_count": text.chars().count(),
|
||||
"notes": notes,
|
||||
}));
|
||||
}
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"path": path,
|
||||
"rev": rev,
|
||||
"total_slides": total_slides,
|
||||
"extracted_slides": slides.len(),
|
||||
"slides": slides,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Extract visible text from PPTX slide XML.
///
/// Scans for `<a:t>` runs (DrawingML text) and then `<w:t>` runs (Word
/// namespaces, used by notes slides) and joins all collected runs with
/// spaces; a `<w:t>` run identical to an already-collected run is skipped.
/// This is a lightweight scanner, not an XML parser: markup nested inside a
/// run is returned verbatim and entities are not decoded.
///
/// Fixes vs. the original draft: `find("<a:t")` also matched longer tag
/// names such as `<a:tab`, capturing raw markup as "text"; the scan now
/// requires the tag name to be terminated. The two near-identical scan
/// loops are also factored into one helper.
fn extract_text_from_pptx_xml(xml: &str) -> String {
    let mut results: Vec<&str> = Vec::new();
    // PPTX uses <a:t> tags for text content.
    collect_tag_text(xml, "a:t", false, &mut results);
    // Notes slides use Word namespaces; skip runs already collected above.
    collect_tag_text(xml, "w:t", true, &mut results);
    results.join(" ")
}

/// Append the trimmed contents of every `<{tag} ...>...</{tag}>` pair found
/// in `xml` to `results`. When `dedupe` is set, runs already present in
/// `results` are skipped.
fn collect_tag_text<'a>(xml: &'a str, tag: &str, dedupe: bool, results: &mut Vec<&'a str>) {
    let open = format!("<{}", tag);
    let close = format!("</{}>", tag);
    let mut last_end = 0;

    while let Some(start) = xml[last_end..].find(&open) {
        let abs_start = last_end + start;
        let after = abs_start + open.len();
        // Reject prefix matches (e.g. `<a:tab` when scanning for `<a:t`):
        // the byte right after the tag name must terminate it.
        match xml.as_bytes().get(after) {
            Some(b'>') | Some(b' ') | Some(b'/') | Some(b'\t') | Some(b'\n') | Some(b'\r') => {}
            _ => {
                last_end = after;
                continue;
            }
        }
        let tag_end = match xml[abs_start..].find('>') {
            Some(i) => i,
            None => break,
        };
        let content_start = abs_start + tag_end + 1;
        let end_tag = match xml[content_start..].find(&close) {
            Some(i) => i,
            None => break,
        };
        let trimmed = xml[content_start..content_start + end_tag].trim();
        if !trimmed.is_empty() && !(dedupe && results.contains(&trimmed)) {
            results.push(trimmed);
        }
        last_end = content_start + end_tag + close.len();
    }
}
|
||||
|
||||
pub fn register_ppt_tools(registry: &mut ToolRegistry) {
|
||||
let p = HashMap::from([
|
||||
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
||||
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
||||
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the .pptx document".into()), required: true, properties: None, items: None }),
|
||||
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
||||
("slide_start".into(), ToolParam { name: "slide_start".into(), param_type: "integer".into(), description: Some("1-based starting slide number (default: 1)".into()), required: false, properties: None, items: None }),
|
||||
("slide_end".into(), ToolParam { name: "slide_end".into(), param_type: "integer".into(), description: Some("1-based ending slide number".into()), required: false, properties: None, items: None }),
|
||||
("include_notes".into(), ToolParam { name: "include_notes".into(), param_type: "boolean".into(), description: Some("Include speaker notes (default: false)".into()), required: false, properties: None, items: None }),
|
||||
]);
|
||||
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
||||
registry.register(
|
||||
ToolDefinition::new("read_ppt")
|
||||
.description("Extract text content from PowerPoint presentations (.pptx). Returns slide-by-slide text with character counts. Supports slide range selection and speaker notes.")
|
||||
.parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
read_ppt_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
154
libs/service/file_tools/sql.rs
Normal file
154
libs/service/file_tools/sql.rs
Normal file
@ -0,0 +1,154 @@
|
||||
//! read_sql — parse and analyze SQL files.
|
||||
|
||||
use crate::file_tools::MAX_FILE_SIZE;
|
||||
use crate::git_tools::ctx::GitToolCtx;
|
||||
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
||||
use sqlparser::ast::{Statement, ColumnDef};
|
||||
use sqlparser::dialect::{GenericDialect, MySqlDialect, PostgreSqlDialect, SQLiteDialect};
|
||||
use sqlparser::parser::Parser;
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Tool executor for `read_sql`: load a SQL file blob at `rev`, parse it
/// with `sqlparser`, and summarize its statements.
///
/// JSON arguments: `project_name`, `repo_name`, `path` (required);
/// `rev` (default "HEAD"); `dialect` — "generic", "mysql",
/// "postgresql"/"postgres", or "sqlite" (default "generic").
///
/// Returns per-kind statement counts plus summaries of CREATE TABLE
/// (columns and types), CREATE VIEW, CREATE INDEX, and CREATE FUNCTION.
async fn read_sql_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;

    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let dialect = p.get("dialect").and_then(|v| v.as_str()).unwrap_or("generic");

    let domain = ctx.open_repo(project_name, repo_name).await?;

    // Full 40-char oids are used verbatim; shorter strings resolve as a prefix.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };

    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;

    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }

    // Lossy decode: invalid UTF-8 becomes U+FFFD rather than failing.
    let text = String::from_utf8_lossy(data);

    // Unknown dialect names silently fall back to GenericDialect.
    let parser_dialect: Box<dyn sqlparser::dialect::Dialect> = match dialect {
        "mysql" => Box::new(MySqlDialect {}),
        "postgresql" | "postgres" => Box::new(PostgreSqlDialect {}),
        "sqlite" => Box::new(SQLiteDialect {}),
        _ => Box::new(GenericDialect {}),
    };

    let statements = Parser::parse_sql(parser_dialect.as_ref(), &text)
        .map_err(|e| format!("SQL parse error: {}", e))?;

    let mut tables: Vec<serde_json::Value> = Vec::new();
    let mut views: Vec<serde_json::Value> = Vec::new();
    let mut functions: Vec<serde_json::Value> = Vec::new();
    let mut indexes: Vec<serde_json::Value> = Vec::new();
    let mut statement_kinds: std::collections::HashMap<String, usize> = std::collections::HashMap::new();

    for statement in &statements {
        // Kind label = Debug text up to the first '{'.
        // NOTE(review): for tuple-style variants (e.g. CreateTable(..)) the
        // label includes the inner struct name — confirm the labels look
        // acceptable in practice.
        let kind = format!("{:?}", statement).split('{').next().unwrap_or("unknown").to_string();
        *statement_kinds.entry(kind).or_insert(0) += 1;

        match statement {
            Statement::CreateTable(stmt) => {
                let name = stmt.name.to_string();
                let columns: Vec<String> = stmt.columns.iter().map(format_column_def).collect();
                tables.push(serde_json::json!({
                    "name": name,
                    "columns": columns,
                    "if_not_exists": stmt.if_not_exists,
                }));
            }
            Statement::CreateView { name, query, .. } => {
                views.push(serde_json::json!({
                    "name": name.to_string(),
                    "query": query.to_string(),
                }));
            }
            Statement::CreateIndex(stmt) => {
                indexes.push(serde_json::json!({
                    "name": stmt.name.as_ref().map(|n| n.to_string()).unwrap_or_default(),
                    "table": stmt.table_name.to_string(),
                    "columns": stmt.columns.iter().map(|c| c.to_string()).collect::<Vec<_>>(),
                }));
            }
            Statement::CreateFunction(stmt) => {
                functions.push(serde_json::json!({
                    "name": stmt.name.to_string(),
                    "args": stmt.args.iter().flat_map(|args| args.iter().filter_map(|a| a.name.as_ref().map(|n| n.to_string()))).collect::<Vec<_>>(),
                    "return_type": stmt.return_type.as_ref().map(|r| r.to_string()).unwrap_or_default(),
                }));
            }
            // Other statements only contribute to statement_kinds counts.
            _ => {}
        }
    }

    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "dialect": dialect,
        "statement_count": statements.len(),
        "statement_kinds": statement_kinds,
        "tables": tables,
        "views": views,
        "functions": functions,
        "indexes": indexes,
    }))
}
|
||||
|
||||
fn format_column_def(col: &ColumnDef) -> String {
|
||||
let name = col.name.to_string();
|
||||
let data_type = col.data_type.to_string();
|
||||
format!("{} {}", name, data_type)
|
||||
}
|
||||
|
||||
pub fn register_sql_tools(registry: &mut ToolRegistry) {
|
||||
let p = HashMap::from([
|
||||
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
||||
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
||||
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the SQL file".into()), required: true, properties: None, items: None }),
|
||||
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
||||
("dialect".into(), ToolParam { name: "dialect".into(), param_type: "string".into(), description: Some("SQL dialect: generic, mysql, postgresql, sqlite. Default: generic".into()), required: false, properties: None, items: None }),
|
||||
]);
|
||||
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
||||
registry.register(
|
||||
ToolDefinition::new("read_sql")
|
||||
.description("Parse and analyze a SQL file. Extracts CREATE TABLE statements (with columns and types), CREATE VIEW, CREATE INDEX, CREATE FUNCTION, and counts all statement types.")
|
||||
.parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
read_sql_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
184
libs/service/file_tools/word.rs
Normal file
184
libs/service/file_tools/word.rs
Normal file
@ -0,0 +1,184 @@
|
||||
//! read_word — parse and extract text from Word documents (.docx) via zip+xml.
|
||||
|
||||
use crate::file_tools::MAX_FILE_SIZE;
|
||||
use crate::git_tools::ctx::GitToolCtx;
|
||||
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
||||
use futures::FutureExt;
|
||||
use quick_xml::events::Event;
|
||||
use quick_xml::Reader;
|
||||
use std::collections::HashMap;
|
||||
use zip::ZipArchive;
|
||||
|
||||
async fn read_word_exec(
|
||||
ctx: GitToolCtx,
|
||||
args: serde_json::Value,
|
||||
) -> Result<serde_json::Value, String> {
|
||||
let p: serde_json::Map<String, serde_json::Value> =
|
||||
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
||||
|
||||
let project_name = p
|
||||
.get("project_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing project_name")?;
|
||||
let repo_name = p
|
||||
.get("repo_name")
|
||||
.and_then(|v| v.as_str())
|
||||
.ok_or("missing repo_name")?;
|
||||
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
||||
let rev = p
|
||||
.get("rev")
|
||||
.and_then(|v| v.as_str())
|
||||
.map(String::from)
|
||||
.unwrap_or_else(|| "HEAD".to_string());
|
||||
let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
|
||||
let limit = p
|
||||
.get("limit")
|
||||
.and_then(|v| v.as_u64())
|
||||
.unwrap_or(200) as usize;
|
||||
let sections_only = p
|
||||
.get("sections_only")
|
||||
.and_then(|v| v.as_bool())
|
||||
.unwrap_or(false);
|
||||
|
||||
let domain = ctx.open_repo(project_name, repo_name).await?;
|
||||
|
||||
let commit_oid = if rev.len() >= 40 {
|
||||
git::commit::types::CommitOid::new(&rev)
|
||||
} else {
|
||||
domain
|
||||
.commit_get_prefix(&rev)
|
||||
.map_err(|e| e.to_string())?
|
||||
.oid
|
||||
};
|
||||
|
||||
let entry = domain
|
||||
.tree_entry_by_path_from_commit(&commit_oid, path)
|
||||
.map_err(|e| e.to_string())?;
|
||||
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
||||
|
||||
let data = &content.content;
|
||||
if data.len() > MAX_FILE_SIZE {
|
||||
return Err(format!(
|
||||
"file too large ({} bytes), max {} bytes",
|
||||
data.len(),
|
||||
MAX_FILE_SIZE
|
||||
));
|
||||
}
|
||||
|
||||
// DOCX is a ZIP archive. Read word/document.xml from it.
|
||||
let cursor = std::io::Cursor::new(data);
|
||||
let mut archive = ZipArchive::new(cursor).map_err(|e| {
|
||||
format!(
|
||||
"failed to open docx as ZIP archive: {}. Make sure the file is a valid .docx document.",
|
||||
e
|
||||
)
|
||||
})?;
|
||||
|
||||
let doc_xml = {
|
||||
let file = if let Ok(f) = archive.by_name("word/document.xml") {
|
||||
f
|
||||
} else {
|
||||
archive.by_name("document.xml")
|
||||
.map_err(|_| "docx archive does not contain word/document.xml or document.xml")?
|
||||
};
|
||||
let mut s = String::new();
|
||||
let mut reader = std::io::BufReader::new(file);
|
||||
std::io::Read::read_to_string(&mut reader, &mut s)
|
||||
.map_err(|e| format!("failed to read document.xml: {}", e))?;
|
||||
s
|
||||
};
|
||||
|
||||
// Parse paragraphs from <w:p> elements
|
||||
let mut reader = Reader::from_str(&doc_xml);
|
||||
reader.config_mut().trim_text(false);
|
||||
|
||||
let mut paragraphs: Vec<String> = Vec::new();
|
||||
let mut buf = Vec::new();
|
||||
let mut in_paragraph = false;
|
||||
let mut current_text = String::new();
|
||||
|
||||
loop {
|
||||
match reader.read_event_into(&mut buf) {
|
||||
Ok(Event::Start(e)) => {
|
||||
if e.name().as_ref() == b"w:p" {
|
||||
in_paragraph = true;
|
||||
current_text.clear();
|
||||
}
|
||||
}
|
||||
Ok(Event::Text(e)) => {
|
||||
if in_paragraph {
|
||||
let txt = e.unescape().map(|s| s.into_owned()).unwrap_or_default();
|
||||
current_text.push_str(&txt);
|
||||
}
|
||||
}
|
||||
Ok(Event::End(e)) => {
|
||||
if e.name().as_ref() == b"w:p" && in_paragraph {
|
||||
in_paragraph = false;
|
||||
let text = current_text.trim().to_string();
|
||||
if !text.is_empty() {
|
||||
paragraphs.push(text);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(Event::Eof) => break,
|
||||
_ => {}
|
||||
}
|
||||
buf.clear();
|
||||
}
|
||||
|
||||
let total = paragraphs.len();
|
||||
|
||||
let body: Vec<serde_json::Value> = if sections_only {
|
||||
paragraphs
|
||||
.iter()
|
||||
.enumerate()
|
||||
.filter(|(_, text)| {
|
||||
text.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
|
||||
&& text.chars().filter(|&c| c == ' ').count() < text.len() / 2
|
||||
&& text.len() < 200
|
||||
})
|
||||
.skip(offset)
|
||||
.take(limit)
|
||||
.map(|(i, t)| serde_json::json!({ "index": i, "text": t }))
|
||||
.collect()
|
||||
} else {
|
||||
paragraphs
|
||||
.iter()
|
||||
.skip(offset)
|
||||
.take(limit)
|
||||
.enumerate()
|
||||
.map(|(i, t)| serde_json::json!({ "index": offset + i, "text": t }))
|
||||
.collect()
|
||||
};
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"path": path,
|
||||
"rev": rev,
|
||||
"paragraph_count": total,
|
||||
"paragraphs": body,
|
||||
}))
|
||||
}
|
||||
|
||||
pub fn register_word_tools(registry: &mut ToolRegistry) {
|
||||
let p = HashMap::from([
|
||||
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
||||
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
||||
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the .docx document".into()), required: true, properties: None, items: None }),
|
||||
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
||||
("sections_only".into(), ToolParam { name: "sections_only".into(), param_type: "boolean".into(), description: Some("If true, extract only section/heading-like paragraphs (short lines starting with uppercase)".into()), required: false, properties: None, items: None }),
|
||||
("offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of paragraphs to skip (default: 0)".into()), required: false, properties: None, items: None }),
|
||||
("limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum paragraphs to return (default: 200)".into()), required: false, properties: None, items: None }),
|
||||
]);
|
||||
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
||||
registry.register(
|
||||
ToolDefinition::new("read_word")
|
||||
.description("Parse and extract text from Word documents (.docx). Returns paragraphs with index and text content. Supports pagination.")
|
||||
.parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
read_word_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
@ -91,6 +91,38 @@ async fn git_file_history_exec(ctx: GitToolCtx, args: serde_json::Value) -> Resu
|
||||
Ok(serde_json::to_value(result).map_err(|e| e.to_string())?)
|
||||
}
|
||||
|
||||
async fn git_blob_get_exec(ctx: GitToolCtx, args: serde_json::Value) -> Result<serde_json::Value, String> {
|
||||
let p: serde_json::Map<String, serde_json::Value> = serde_json::from_value(args).map_err(|e| e.to_string())?;
|
||||
let project_name = p.get("project_name").and_then(|v| v.as_str()).ok_or("missing project_name")?;
|
||||
let repo_name = p.get("repo_name").and_then(|v| v.as_str()).ok_or("missing repo_name")?;
|
||||
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
||||
let rev = p.get("rev").and_then(|v| v.as_str()).map(String::from).unwrap_or_else(|| "HEAD".to_string());
|
||||
|
||||
let domain = ctx.open_repo(project_name, repo_name).await?;
|
||||
let oid = if rev.len() >= 40 {
|
||||
git::commit::types::CommitOid::new(&rev)
|
||||
} else {
|
||||
domain.commit_get_prefix(&rev).map_err(|e| e.to_string())?.oid
|
||||
};
|
||||
|
||||
let entry = domain.tree_entry_by_path_from_commit(&oid, path).map_err(|e| e.to_string())?;
|
||||
let blob_info = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
|
||||
|
||||
if blob_info.is_binary {
|
||||
return Err(format!("file '{}' is binary, cannot return as text", path));
|
||||
}
|
||||
|
||||
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
||||
let text = String::from_utf8_lossy(&content.content).to_string();
|
||||
|
||||
Ok(serde_json::json!({
|
||||
"path": path,
|
||||
"oid": entry.oid.to_string(),
|
||||
"size": blob_info.size,
|
||||
"content": text,
|
||||
}))
|
||||
}
|
||||
|
||||
fn flatten_commit(c: &git::commit::types::CommitMeta) -> serde_json::Value {
|
||||
use chrono::TimeZone;
|
||||
let ts = c.author.time_secs + (c.author.offset_minutes as i64 * 60);
|
||||
@ -162,4 +194,22 @@ pub fn register_git_tools(registry: &mut ToolRegistry) {
|
||||
})
|
||||
}),
|
||||
);
|
||||
|
||||
// git_blob_get
|
||||
let p = HashMap::from([
|
||||
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
||||
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
||||
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path within the repository".into()), required: true, properties: None, items: None }),
|
||||
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Revision to read file from (default: HEAD)".into()), required: false, properties: None, items: None }),
|
||||
]);
|
||||
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
||||
registry.register(
|
||||
ToolDefinition::new("git_blob_get").description("Retrieve the raw content of a single file (blob) at a given revision. Returns error if the file is binary.").parameters(schema),
|
||||
ToolHandler::new(|ctx, args| {
|
||||
let gctx = super::ctx::GitToolCtx::new(ctx);
|
||||
Box::pin(async move {
|
||||
git_blob_get_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
||||
})
|
||||
}),
|
||||
);
|
||||
}
|
||||
@ -148,6 +148,7 @@ impl AppService {
|
||||
let client = async_openai::Client::with_config(cfg);
|
||||
let mut registry = ToolRegistry::new();
|
||||
git_tools::register_all(&mut registry);
|
||||
file_tools::register_all(&mut registry);
|
||||
Some(Arc::new(ChatService::new(client).with_tool_registry(registry)))
|
||||
}
|
||||
(Err(e), _) => {
|
||||
@ -229,6 +230,7 @@ pub mod auth;
|
||||
pub mod error;
|
||||
pub mod git;
|
||||
pub mod git_tools;
|
||||
pub mod file_tools;
|
||||
pub mod issue;
|
||||
pub mod project;
|
||||
pub mod pull_request;
|
||||
|
||||
Loading…
Reference in New Issue
Block a user