From 1af796ac759209e6a0c5a30afce4507a0d762cd6 Mon Sep 17 00:00:00 2001 From: ZhenYi <434836402@qq.com> Date: Sat, 18 Apr 2026 23:02:10 +0800 Subject: [PATCH] feat(service): add file_tools module and git_blob_get tool Add AI-accessible tools for reading structured files (CSV, JSON/JSONC, Markdown, SQL) and searching repository content (git_grep). Also adds git_blob_get to retrieve raw blob text content with binary detection. Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37). --- Cargo.lock | 248 +++++++++++++++++++- Cargo.toml | 10 +- libs/service/Cargo.toml | 11 + libs/service/file_tools/csv.rs | 325 ++++++++++++++++++++++++++ libs/service/file_tools/excel.rs | 184 +++++++++++++++ libs/service/file_tools/grep.rs | 341 ++++++++++++++++++++++++++++ libs/service/file_tools/json.rs | 275 ++++++++++++++++++++++ libs/service/file_tools/markdown.rs | 286 +++++++++++++++++++++++ libs/service/file_tools/mod.rs | 39 ++++ libs/service/file_tools/pdf.rs | 244 ++++++++++++++++++++ libs/service/file_tools/ppt.rs | 204 +++++++++++++++++ libs/service/file_tools/sql.rs | 154 +++++++++++++ libs/service/file_tools/word.rs | 184 +++++++++++++++ libs/service/git_tools/tree.rs | 50 ++++ libs/service/lib.rs | 2 + 15 files changed, 2553 insertions(+), 4 deletions(-) create mode 100644 libs/service/file_tools/csv.rs create mode 100644 libs/service/file_tools/excel.rs create mode 100644 libs/service/file_tools/grep.rs create mode 100644 libs/service/file_tools/json.rs create mode 100644 libs/service/file_tools/markdown.rs create mode 100644 libs/service/file_tools/mod.rs create mode 100644 libs/service/file_tools/pdf.rs create mode 100644 libs/service/file_tools/ppt.rs create mode 100644 libs/service/file_tools/sql.rs create mode 100644 libs/service/file_tools/word.rs diff --git a/Cargo.lock b/Cargo.lock index 2dffe4b..b169829 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -614,11 +614,23 @@ dependencies = [ 
"num-traits", ] +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + [[package]] name = "arbitrary" version = "1.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3d036a3c4ab069c7b410a2ce876bd74808d2d0888a82667669f8e783a898bf1" +dependencies = [ + "derive_arbitrary", +] [[package]] name = "arc-swap" @@ -1357,6 +1369,21 @@ dependencies = [ "libbz2-rs-sys", ] +[[package]] +name = "calamine" +version = "0.26.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "138646b9af2c5d7f1804ea4bf93afc597737d2bd4f7341d67c48b03316976eb1" +dependencies = [ + "byteorder", + "codepage", + "encoding_rs", + "log", + "quick-xml 0.31.0", + "serde", + "zip 2.4.2", +] + [[package]] name = "captcha-rs" version = "0.5.0" @@ -1509,6 +1536,15 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "de0758edba32d61d1fd9f4d69491b47604b91ee2f7e6b33de7e54ca4ebe55dc3" +[[package]] +name = "codepage" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48f68d061bc2828ae826206326e61251aca94c1e4a5305cf52d9138639c918b4" +dependencies = [ + "encoding_rs", +] + [[package]] name = "color_quant" version = "1.1.0" @@ -1799,6 +1835,27 @@ dependencies = [ "hybrid-array", ] +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + [[package]] name = 
"ctr" version = "0.9.2" @@ -1989,6 +2046,17 @@ dependencies = [ "serde_core", ] +[[package]] +name = "derive_arbitrary" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e567bd82dcff979e4b03460c307b3cdc9e96fde3d73bed1496d2bc75d9dd62a" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.117", +] + [[package]] name = "derive_builder" version = "0.20.2" @@ -2676,6 +2744,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -2786,7 +2863,7 @@ dependencies = [ "tokio", "tokio-util", "uuid", - "zip", + "zip 8.4.0", ] [[package]] @@ -4203,6 +4280,26 @@ dependencies = [ "imgref", ] +[[package]] +name = "lopdf" +version = "0.34.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5c8ecfc6c72051981c0459f75ccc585e7ff67c70829560cda8e647882a9abff" +dependencies = [ + "chrono", + "encoding_rs", + "flate2", + "indexmap 2.13.0", + "itoa", + "log", + "md-5", + "nom 7.1.3", + "rangemap", + "rayon", + "time", + "weezl", +] + [[package]] name = "lru" version = "0.12.5" @@ -4698,6 +4795,15 @@ dependencies = [ "objc2-core-foundation", ] +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + [[package]] name = "once_cell" version = "1.21.4" @@ -5424,6 +5530,16 @@ dependencies = [ "prost", ] +[[package]] +name = "psm" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +dependencies = [ + "ar_archive_writer", + "cc", +] + [[package]] name = "ptr_meta" 
version = "0.1.4" @@ -5444,6 +5560,25 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "pulldown-cmark" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f86ba2052aebccc42cbbb3ed234b8b13ce76f75c3551a303cb2bcffcff12bb14" +dependencies = [ + "bitflags", + "getopts", + "memchr", + "pulldown-cmark-escape", + "unicase", +] + +[[package]] +name = "pulldown-cmark-escape" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "007d8adb5ddab6f8e3f491ac63566a7d5002cc7ed73901f72057943fa71ae1ae" + [[package]] name = "pxfm" version = "0.1.28" @@ -5505,6 +5640,25 @@ version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a993555f31e5a609f617c12db6250dedcac1b0a85076912c436e6fc9b2c8e6a3" +[[package]] +name = "quick-xml" +version = "0.31.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1004a344b30a54e2ee58d66a71b32d2db2feb0a31f9a2d302bf0536f15de2a33" +dependencies = [ + "encoding_rs", + "memchr", +] + +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", +] + [[package]] name = "quinn" version = "0.11.9" @@ -5679,6 +5833,12 @@ dependencies = [ "rand 0.9.2", ] +[[package]] +name = "rangemap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" + [[package]] name = "rav1e" version = "0.8.1" @@ -5755,6 +5915,26 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "recursive" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0786a43debb760f491b1bc0269fe5e84155353c67482b9e60d0cfb596054b43e" +dependencies = [ + "recursive-proc-macro-impl", + "stacker", +] + +[[package]] +name = 
"recursive-proc-macro-impl" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76009fbe0614077fc1a2ce255e3a1881a2e3a3527097d5dc6d8212c585e7e38b" +dependencies = [ + "quote", + "syn 2.0.117", +] + [[package]] name = "redis" version = "1.1.0" @@ -6745,22 +6925,29 @@ dependencies = [ "async-openai", "avatar", "base64 0.22.1", + "calamine", "captcha-rs", "chrono", "config", + "csv", "db", "deadpool-redis", "email", + "flate2", "futures", "git", "git2", "hex", "hmac", + "lopdf", "models", "moka", + "pulldown-cmark", "queue", + "quick-xml 0.37.5", "rand 0.10.0", "redis", + "regex", "reqwest 0.13.2", "room", "rsa", @@ -6772,11 +6959,15 @@ dependencies = [ "sha1", "sha2 0.11.0", "slog", + "sqlparser", + "tempfile", "tokio", "tokio-stream", "tracing", "utoipa", "uuid", + "walkdir", + "zip 8.4.0", ] [[package]] @@ -6992,6 +7183,16 @@ dependencies = [ "der", ] +[[package]] +name = "sqlparser" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4521174166bac1ff04fe16ef4524c70144cd29682a45978978ca3d7f4e0be11" +dependencies = [ + "log", + "recursive", +] + [[package]] name = "sqlx" version = "0.8.6" @@ -7275,6 +7476,19 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "stacker" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + [[package]] name = "static-server" version = "0.2.9" @@ -7936,6 +8150,12 @@ version = "1.13.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9629274872b2bfaf8d66f5f15725007f635594914870f65218920345aa11aa8c" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unicode-xid" version = "0.2.6" @@ -8504,6 +8724,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-sys" version = "0.60.2" @@ -8974,6 +9203,23 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "zip" +version = "2.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabe6324e908f85a1c52063ce7aa26b68dcb7eb6dbc83a2d148403c9bc3eba50" +dependencies = [ + "arbitrary", + "crc32fast", + "crossbeam-utils", + "displaydoc", + "flate2", + "indexmap 2.13.0", + "memchr", + "thiserror 2.0.18", + "zopfli", +] + [[package]] name = "zip" version = "8.4.0" diff --git a/Cargo.toml b/Cargo.toml index e143985..d7aa8f7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -142,6 +142,12 @@ hostname = "0.4" utoipa = { version = "5.4.0", features = ["chrono", "uuid"] } rust_decimal = "1.40.0" walkdir = "2.5.0" +calamine = "0.26" +csv = "1.3" +lopdf = "0.34" +pulldown-cmark = "0.12" +quick-xml = "0.37" +sqlparser = "0.55" lazy_static = "1.5" moka = "0.12.15" serde = "1.0.228" @@ -151,9 +157,7 @@ serde_bytes = "0.11.19" phf = "0.13.1" phf_codegen = "0.13.1" base64 = "0.22.1" - - - +tempfile = "3" [workspace.package] version = "0.2.9" diff --git a/libs/service/Cargo.toml b/libs/service/Cargo.toml index 033790e..6e98824 100644 --- a/libs/service/Cargo.toml +++ b/libs/service/Cargo.toml @@ -54,6 +54,17 @@ futures = { workspace = true } deadpool-redis = { workspace = true, features = ["rt_tokio_1", "cluster-async", "cluster"] } moka = { workspace = true, features = ["future"] } rust_decimal = { workspace = true } +calamine = { workspace = 
true } +csv = { workspace = true } +quick-xml = { workspace = true } +lopdf = { workspace = true } +pulldown-cmark = { workspace = true } +sqlparser = { workspace = true } +walkdir = { workspace = true } +zip = { workspace = true } +regex = { workspace = true } +flate2 = { workspace = true } +tempfile = { workspace = true } [lints] workspace = true diff --git a/libs/service/file_tools/csv.rs b/libs/service/file_tools/csv.rs new file mode 100644 index 0000000..78db2f9 --- /dev/null +++ b/libs/service/file_tools/csv.rs @@ -0,0 +1,325 @@ +//! read_csv — parse and query CSV files. + +use crate::file_tools::MAX_FILE_SIZE; +use crate::git_tools::ctx::GitToolCtx; +use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema}; +use csv::ReaderBuilder; +use std::collections::HashMap; + +async fn read_csv_exec( + ctx: GitToolCtx, + args: serde_json::Value, +) -> Result { + let p: serde_json::Map = + serde_json::from_value(args).map_err(|e| e.to_string())?; + + let project_name = p + .get("project_name") + .and_then(|v| v.as_str()) + .ok_or("missing project_name")?; + let repo_name = p + .get("repo_name") + .and_then(|v| v.as_str()) + .ok_or("missing repo_name")?; + let path = p + .get("path") + .and_then(|v| v.as_str()) + .ok_or("missing path")?; + let rev = p + .get("rev") + .and_then(|v| v.as_str()) + .map(String::from) + .unwrap_or_else(|| "HEAD".to_string()); + let delimiter = p + .get("delimiter") + .and_then(|v| v.as_str()) + .and_then(|s| s.chars().next()) + .unwrap_or(','); + let has_header = p + .get("has_header") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize; + let limit = p.get("limit").and_then(|v| v.as_u64()).unwrap_or(100) as usize; + let filter_col = p.get("filter_column").and_then(|v| v.as_str()); + let filter_val = p.get("filter_value").and_then(|v| v.as_str()); + let select_cols = p.get("columns").and_then(|v| v.as_array()).map(|a| { + a.iter() + .filter_map(|v| 
v.as_str().map(String::from)) + .collect::>() + }); + + let domain = ctx.open_repo(project_name, repo_name).await?; + + let commit_oid = if rev.len() >= 40 { + git::commit::types::CommitOid::new(&rev) + } else { + domain + .commit_get_prefix(&rev) + .map_err(|e| e.to_string())? + .oid + }; + + let entry = domain + .tree_entry_by_path_from_commit(&commit_oid, path) + .map_err(|e| e.to_string())?; + let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?; + + if blob.is_binary { + return Err("file is binary, not a CSV".to_string()); + } + + let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?; + let data = &content.content; + if data.len() > MAX_FILE_SIZE { + return Err(format!( + "file too large ({} bytes), max {} bytes", + data.len(), + MAX_FILE_SIZE + )); + } + + let text = String::from_utf8_lossy(data); + let mut reader = ReaderBuilder::new() + .delimiter(delimiter as u8) + .has_headers(has_header) + .from_reader(text.as_bytes()); + + let headers: Vec = if has_header { + reader + .headers() + .map_err(|e| e.to_string())? 
+ .clone() + .into_iter() + .map(String::from) + .collect() + } else { + vec![] + }; + + let col_indices: Vec = if let Some(ref sel) = select_cols { + sel.iter() + .filter_map(|col| headers.iter().position(|h| h == col)) + .collect() + } else { + (0..headers.len()).collect() + }; + + let _col_set: std::collections::HashSet = col_indices.iter().cloned().collect(); + let filter_col_idx = filter_col.and_then(|c| headers.iter().position(|h| h == c)); + + let mut rows: Vec = Vec::new(); + let mut skipped = 0; + let mut total = 0; + + for result in reader.records() { + let record = result.map_err(|e| e.to_string())?; + + // Skip offset + if skipped < offset { + skipped += 1; + continue; + } + + total += 1; + + // Filter + if let (Some(fci), Some(fv)) = (filter_col_idx, filter_val) { + if record.get(fci) != Some(fv) { + continue; + } + } + + // Select columns + let obj = if has_header { + let mut map = serde_json::Map::new(); + for &idx in &col_indices { + let key = headers + .get(idx) + .cloned() + .unwrap_or_else(|| format!("col_{}", idx)); + let val = record.get(idx).unwrap_or("").to_string(); + map.insert(key, serde_json::Value::String(val)); + } + serde_json::Value::Object(map) + } else { + let arr: Vec = col_indices + .iter() + .map(|&idx| record.get(idx).unwrap_or("").to_string()) + .collect(); + serde_json::Value::Array(arr.into_iter().map(serde_json::Value::String).collect()) + }; + + rows.push(obj); + + if rows.len() >= limit { + break; + } + } + + Ok(serde_json::json!({ + "path": path, + "rev": rev, + "headers": if has_header { headers } else { vec![] }, + "selected_columns": select_cols, + "rows": rows, + "row_count": rows.len(), + "total_available": total + offset, + "filter": if let (Some(c), Some(v)) = (filter_col, filter_val) { + serde_json::json!({ "column": c, "value": v }) + } else { serde_json::Value::Null }, + })) +} + +pub fn register_csv_tools(registry: &mut ToolRegistry) { + let p = HashMap::from([ + ( + "project_name".into(), + ToolParam { + name: 
"project_name".into(), + param_type: "string".into(), + description: Some("Project name (slug)".into()), + required: true, + properties: None, + items: None, + }, + ), + ( + "repo_name".into(), + ToolParam { + name: "repo_name".into(), + param_type: "string".into(), + description: Some("Repository name".into()), + required: true, + properties: None, + items: None, + }, + ), + ( + "path".into(), + ToolParam { + name: "path".into(), + param_type: "string".into(), + description: Some("File path within the repository".into()), + required: true, + properties: None, + items: None, + }, + ), + ( + "rev".into(), + ToolParam { + name: "rev".into(), + param_type: "string".into(), + description: Some("Git revision (default: HEAD)".into()), + required: false, + properties: None, + items: None, + }, + ), + ( + "delimiter".into(), + ToolParam { + name: "delimiter".into(), + param_type: "string".into(), + description: Some("Field delimiter character (default: comma \",\")".into()), + required: false, + properties: None, + items: None, + }, + ), + ( + "has_header".into(), + ToolParam { + name: "has_header".into(), + param_type: "boolean".into(), + description: Some("If true, first row is column headers (default: true)".into()), + required: false, + properties: None, + items: None, + }, + ), + ( + "columns".into(), + ToolParam { + name: "columns".into(), + param_type: "array".into(), + description: Some("List of column names to select".into()), + required: false, + properties: None, + items: Some(Box::new(ToolParam { + name: "".into(), + param_type: "string".into(), + description: None, + required: false, + properties: None, + items: None, + })), + }, + ), + ( + "filter_column".into(), + ToolParam { + name: "filter_column".into(), + param_type: "string".into(), + description: Some("Column name to filter by".into()), + required: false, + properties: None, + items: None, + }, + ), + ( + "filter_value".into(), + ToolParam { + name: "filter_value".into(), + param_type: "string".into(), 
+ description: Some("Value to match in filter_column".into()), + required: false, + properties: None, + items: None, + }, + ), + ( + "offset".into(), + ToolParam { + name: "offset".into(), + param_type: "integer".into(), + description: Some("Number of rows to skip (default: 0)".into()), + required: false, + properties: None, + items: None, + }, + ), + ( + "limit".into(), + ToolParam { + name: "limit".into(), + param_type: "integer".into(), + description: Some("Maximum rows to return (default: 100)".into()), + required: false, + properties: None, + items: None, + }, + ), + ]); + let schema = ToolSchema { + schema_type: "object".into(), + properties: Some(p), + required: Some(vec![ + "project_name".into(), + "repo_name".into(), + "path".into(), + ]), + }; + registry.register( + ToolDefinition::new("read_csv") + .description("Parse and query a CSV file. Supports header detection, column selection, filtering, pagination (offset/limit), and custom delimiters.") + .parameters(schema), + ToolHandler::new(|ctx, args| { + let gctx = GitToolCtx::new(ctx); + Box::pin(async move { + read_csv_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) + }) + }), + ); +} diff --git a/libs/service/file_tools/excel.rs b/libs/service/file_tools/excel.rs new file mode 100644 index 0000000..3b32b87 --- /dev/null +++ b/libs/service/file_tools/excel.rs @@ -0,0 +1,184 @@ +//! read_excel — parse and query Excel files (.xlsx, .xls). 
+ +use crate::file_tools::MAX_FILE_SIZE; +use crate::git_tools::ctx::GitToolCtx; +use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema}; +use calamine::{open_workbook, Reader, Xlsx}; +use futures::FutureExt; +use std::collections::HashMap; + +async fn read_excel_exec( + ctx: GitToolCtx, + args: serde_json::Value, +) -> Result { + let p: serde_json::Map = + serde_json::from_value(args).map_err(|e| e.to_string())?; + + let project_name = p + .get("project_name") + .and_then(|v| v.as_str()) + .ok_or("missing project_name")?; + let repo_name = p + .get("repo_name") + .and_then(|v| v.as_str()) + .ok_or("missing repo_name")?; + let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?; + let rev = p + .get("rev") + .and_then(|v| v.as_str()) + .map(String::from) + .unwrap_or_else(|| "HEAD".to_string()); + let sheet_name = p.get("sheet_name").and_then(|v| v.as_str()).map(String::from); + let sheet_index = p.get("sheet_index").and_then(|v| v.as_u64()).map(|v| v as usize); + let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize; + let limit = p + .get("limit") + .and_then(|v| v.as_u64()) + .unwrap_or(100) as usize; + let has_header = p + .get("has_header") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + + let domain = ctx.open_repo(project_name, repo_name).await?; + + let commit_oid = if rev.len() >= 40 { + git::commit::types::CommitOid::new(&rev) + } else { + domain + .commit_get_prefix(&rev) + .map_err(|e| e.to_string())? 
+ .oid + }; + + let entry = domain + .tree_entry_by_path_from_commit(&commit_oid, path) + .map_err(|e| e.to_string())?; + let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?; + let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?; + + let data = &content.content; + if data.len() > MAX_FILE_SIZE { + return Err(format!( + "file too large ({} bytes), max {} bytes", + data.len(), + MAX_FILE_SIZE + )); + } + + // Use cursor-based reading to avoid tempfile + let cursor = std::io::Cursor::new(data.clone()); + let mut workbook: Xlsx>> = + open_workbook(cursor).map_err(|e| format!("failed to open Excel: {}", e))?; + + let sheet_names = workbook.sheet_names().to_vec(); + + // Determine which sheet to read + let sheet_idx = match (sheet_name.clone(), sheet_index) { + (Some(name), _) => sheet_names + .iter() + .position(|n| n == &name) + .ok_or_else(|| format!("sheet '{}' not found. Available: {:?}", name, sheet_names))?, + (_, Some(idx)) => { + if idx >= sheet_names.len() { + return Err(format!( + "sheet index {} out of range (0..{})", + idx, + sheet_names.len() + )); + } + idx + } + _ => 0, + }; + + let range = workbook + .worksheet_range_at(sheet_idx) + .map_err(|e| format!("failed to read sheet: {}", e))?; + + let rows: Vec> = range + .rows() + .skip(if has_header { offset + 1 } else { offset }) + .take(limit) + .map(|row| { + row.iter() + .map(|cell| { + use calamine::Data; + match cell { + Data::Int(i) => serde_json::Value::Number((*i).into()), + Data::Float(f) => { + serde_json::json!(f) + } + Data::String(s) => serde_json::Value::String(s.clone()), + Data::Bool(b) => serde_json::Value::Bool(*b), + Data::DateTime(dt) => { + serde_json::Value::String(format!("{:?}", dt)) + } + Data::DateTimeIso(s) => serde_json::Value::String(s.clone()), + Data::DurationIso(s) => serde_json::Value::String(s.clone()), + Data::Error(e) => serde_json::json!({ "error": format!("{:?}", e) }), + Data::Empty => serde_json::Value::Null, + } + }) + .collect() + 
}) + .collect(); + + let header_row: Vec = if has_header { + range + .rows() + .next() + .map(|row| { + row.iter() + .map(|c| { + if let calamine::Data::String(s) = c { + s.clone() + } else { + String::new() + } + }) + .collect() + }) + .unwrap_or_default() + } else { + vec![] + }; + + Ok(serde_json::json!({ + "path": path, + "rev": rev, + "sheets": sheet_names, + "active_sheet": sheet_names.get(sheet_idx).cloned(), + "sheet_index": sheet_idx, + "headers": header_row, + "rows": rows, + "row_count": rows.len(), + "total_rows": range.rows().count().saturating_sub(if has_header { 1 } else { 0 }), + })) +} + +pub fn register_excel_tools(registry: &mut ToolRegistry) { + let p = HashMap::from([ + ("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }), + ("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }), + ("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path within the repository (supports .xlsx, .xls)".into()), required: true, properties: None, items: None }), + ("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }), + ("sheet_name".into(), ToolParam { name: "sheet_name".into(), param_type: "string".into(), description: Some("Sheet name to read. Defaults to first sheet.".into()), required: false, properties: None, items: None }), + ("sheet_index".into(), ToolParam { name: "sheet_index".into(), param_type: "integer".into(), description: Some("Sheet index (0-based). 
Ignored if sheet_name is set.".into()), required: false, properties: None, items: None }), + ("has_header".into(), ToolParam { name: "has_header".into(), param_type: "boolean".into(), description: Some("If true, first row is column headers (default: true)".into()), required: false, properties: None, items: None }), + ("offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of rows to skip (default: 0)".into()), required: false, properties: None, items: None }), + ("limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum rows to return (default: 100)".into()), required: false, properties: None, items: None }), + ]); + let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) }; + registry.register( + ToolDefinition::new("read_excel") + .description("Parse and query Excel spreadsheets (.xlsx, .xls). Returns sheet names, headers, and rows with support for sheet selection and pagination.") + .parameters(schema), + ToolHandler::new(|ctx, args| { + let gctx = GitToolCtx::new(ctx); + Box::pin(async move { + read_excel_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) + }) + }), + ); +} diff --git a/libs/service/file_tools/grep.rs b/libs/service/file_tools/grep.rs new file mode 100644 index 0000000..95ebca7 --- /dev/null +++ b/libs/service/file_tools/grep.rs @@ -0,0 +1,341 @@ +//! git_grep — search repository files for patterns. + +use crate::file_tools::MAX_FILE_SIZE; +use crate::git_tools::ctx::GitToolCtx; +use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema}; +use regex::RegexBuilder; +use std::collections::HashMap; + +/// Text file extensions to search (skip binary files). 
+const TEXT_EXTS: &[&str] = &[ + "rs", "toml", "yaml", "yml", "json", "jsonc", "js", "jsx", "ts", "tsx", + "css", "scss", "less", "html", "htm", "xml", "svg", "vue", "svelte", + "py", "rb", "go", "java", "kt", "swift", "c", "cpp", "h", "hpp", + "cs", "php", "pl", "sh", "bash", "zsh", "fish", "ps1", "bat", "cmd", + "sql", "md", "markdown", "rst", "txt", "log", "ini", "cfg", "conf", + "dockerfile", "makefile", "cmake", "gradle", "properties", "env", + "proto", "graphql", "vue", "lock", +]; + +fn is_text_ext(path: &str) -> bool { + let lower = path.to_lowercase(); + TEXT_EXTS.iter().any(|&e| lower.ends_with(&format!(".{}", e))) +} + +fn is_binary_content(data: &[u8]) -> bool { + data.iter().take(8192).any(|&b| b == 0) +} + +async fn git_grep_exec( + ctx: GitToolCtx, + args: serde_json::Value, +) -> Result { + let p: serde_json::Map = + serde_json::from_value(args).map_err(|e| e.to_string())?; + + let project_name = p + .get("project_name") + .and_then(|v| v.as_str()) + .ok_or("missing project_name")?; + let repo_name = p + .get("repo_name") + .and_then(|v| v.as_str()) + .ok_or("missing repo_name")?; + let rev = p + .get("rev") + .and_then(|v| v.as_str()) + .map(String::from) + .unwrap_or_else(|| "HEAD".to_string()); + let pattern = p + .get("pattern") + .and_then(|v| v.as_str()) + .ok_or("missing pattern")?; + let glob = p.get("glob").and_then(|v| v.as_str()).map(String::from); + let is_regex = p + .get("is_regex") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let context_lines = p + .get("context_lines") + .and_then(|v| v.as_u64()) + .unwrap_or(0) as usize; + let max_results = p + .get("max_results") + .and_then(|v| v.as_u64()) + .unwrap_or(100) as usize; + + let domain = ctx.open_repo(project_name, repo_name).await?; + + // Resolve revision to commit oid + let commit_oid = if rev.len() >= 40 { + git::commit::types::CommitOid::new(&rev) + } else { + domain + .commit_get_prefix(&rev) + .map_err(|e| e.to_string())? 
+ .oid + }; + + let regex = if is_regex { + RegexBuilder::new(pattern) + .case_insensitive(true) + .build() + .map_err(|e| format!("invalid regex '{}': {}", pattern, e))? + } else { + // Escape for literal search + RegexBuilder::new(®ex::escape(pattern)) + .case_insensitive(true) + .build() + .map_err(|e| e.to_string())? + }; + + // Recursive tree walk using git2 + let repo = domain.repo(); + let commit = repo + .find_commit(commit_oid.to_oid().map_err(|e| e.to_string())?) + .map_err(|e| e.to_string())?; + let tree = commit.tree().map_err(|e| e.to_string())?; + + let mut results: Vec = Vec::new(); + // Stack: (tree, current_path_prefix) + let mut stack: Vec<(git2::Tree<'_>, String)> = vec![(tree, String::new())]; + + while let Some((current_tree, current_prefix)) = stack.pop() { + for entry in current_tree.iter() { + let name = entry.name().unwrap_or_default(); + if name.is_empty() { + continue; + } + let path: String = if current_prefix.is_empty() { + name.to_string() + } else { + format!("{}/{}", current_prefix, name) + }; + + if entry.kind() == Some(git2::ObjectType::Tree) { + if let Some(subtree) = entry.to_object(&repo).ok().and_then(|o| o.into_tree().ok()) { + stack.push((subtree, path)); + } + continue; + } + + if entry.kind() != Some(git2::ObjectType::Blob) { + continue; + } + + // Glob filter + if let Some(ref g) = glob { + if !glob_match(&path, g) { + continue; + } + } else if !is_text_ext(&path) { + continue; + } + + // Read blob content + let blob = match entry.to_object(&repo).ok().and_then(|o| o.into_blob().ok()) { + Some(b) => b, + None => continue, + }; + + let size = blob.size(); + if size == 0 || size > MAX_FILE_SIZE { + continue; + } + + let data = blob.content(); + if is_binary_content(data) { + continue; + } + + let content = match String::from_utf8(data.to_vec()) { + Ok(s) => s, + Err(_) => continue, + }; + + // Search line by line + let lines: Vec<&str> = content.lines().collect(); + for (line_idx, line) in lines.iter().enumerate() { + if 
/// Case-insensitive glob match for repository paths.
///
/// Supported syntax: `*` (any run of characters within one path segment),
/// `?` (exactly one character), `**` (zero or more whole path segments).
/// A pattern without `/` is matched against the file name only (so `*.rs`
/// matches `src/main.rs`); a pattern containing `/` is matched
/// segment-by-segment against the full path.
///
/// Fixes over the previous version:
/// - multi-segment patterns must now consume the whole path (`a/b` no
///   longer spuriously matched the path `a`)
/// - matching is case-insensitive on both branches (the old code lowered
///   the pattern only in the single-segment branch and left the unused
///   `_path_lower` behind)
/// - `?` and multiple `*` within one segment are honoured
fn glob_match(path: &str, pattern: &str) -> bool {
    // One path segment against one pattern segment (`*` / `?` wildcards).
    // Classic iterative wildcard matcher with backtracking to the most
    // recent `*` — O(text·pattern) worst case, no recursion.
    fn part_matches(text: &str, pat: &str) -> bool {
        let t: Vec<char> = text.chars().collect();
        let p: Vec<char> = pat.chars().collect();
        let (mut ti, mut pi) = (0usize, 0usize);
        // (pattern index of the last '*', text index it should retry from)
        let mut star: Option<(usize, usize)> = None;
        while ti < t.len() {
            if pi < p.len() && (p[pi] == '?' || p[pi] == t[ti]) {
                ti += 1;
                pi += 1;
            } else if pi < p.len() && p[pi] == '*' {
                star = Some((pi, ti));
                pi += 1;
            } else if let Some((sp, st)) = star {
                // Let the last '*' swallow one more character and retry.
                star = Some((sp, st + 1));
                ti = st + 1;
                pi = sp + 1;
            } else {
                return false;
            }
        }
        // Any trailing pattern must be all '*' to count as a full match.
        p[pi..].iter().all(|&c| c == '*')
    }

    // Segment-wise match; `**` may absorb zero or more whole segments.
    fn seg_match(segs: &[&str], pats: &[&str]) -> bool {
        match pats.split_first() {
            None => segs.is_empty(),
            Some((&"**", rest)) => (0..=segs.len()).any(|k| seg_match(&segs[k..], rest)),
            Some((pat, rest)) => match segs.split_first() {
                // An empty pattern segment (e.g. from "src//x") matches
                // anything, mirroring the old implementation's leniency.
                Some((seg, seg_rest)) if pat.is_empty() || part_matches(seg, pat) => {
                    seg_match(seg_rest, rest)
                }
                _ => false,
            },
        }
    }

    let path = path.to_lowercase();
    let pattern = pattern.to_lowercase();
    let pats: Vec<&str> = pattern.split('/').collect();
    let segs: Vec<&str> = path.split('/').collect();

    if pats.len() == 1 {
        // A bare pattern like `*.rs` filters on the file name alone.
        let file_name = segs.last().copied().unwrap_or("");
        return pats[0].is_empty() || part_matches(file_name, pats[0]);
    }
    seg_match(&segs, &pats)
}
Default: true".into()), + required: false, + properties: None, + items: None, + }), + ("context_lines".into(), ToolParam { + name: "context_lines".into(), + param_type: "integer".into(), + description: Some("Number of surrounding lines to include for each match. Default: 0".into()), + required: false, + properties: None, + items: None, + }), + ("max_results".into(), ToolParam { + name: "max_results".into(), + param_type: "integer".into(), + description: Some("Maximum number of matches to return. Default: 100".into()), + required: false, + properties: None, + items: None, + }), + ]); + + let schema = ToolSchema { + schema_type: "object".into(), + properties: Some(p), + required: Some(vec!["project_name".into(), "repo_name".into(), "pattern".into()]), + }; + + registry.register( + ToolDefinition::new("git_grep") + .description("Search for a text pattern across all files in a repository at a given revision. Supports regex, glob filtering, and line-level context. Skips binary files automatically.") + .parameters(schema), + ToolHandler::new(|ctx, args| { + let gctx = GitToolCtx::new(ctx); + Box::pin(async move { + git_grep_exec(gctx, args) + .await + .map_err(agent::ToolError::ExecutionError) + }) + }), + ); +} diff --git a/libs/service/file_tools/json.rs b/libs/service/file_tools/json.rs new file mode 100644 index 0000000..6a6b2f3 --- /dev/null +++ b/libs/service/file_tools/json.rs @@ -0,0 +1,275 @@ +//! read_json — parse, validate, and query JSON / JSONC files. + +use crate::file_tools::MAX_FILE_SIZE; +use crate::git_tools::ctx::GitToolCtx; +use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema}; +use serde_json::Value as JsonValue; +use std::collections::HashMap; + +/// Remove comments from JSONC (lines starting with // or /* */) for parsing. 
/// Strip `//` line comments and `/* ... */` block comments from JSONC text
/// so the result can be fed to a strict JSON parser.
///
/// String literals are left untouched (comment markers inside quotes are
/// preserved, including after `\"` escapes), and the newline terminating a
/// line comment is kept so line numbering stays stable for parse errors.
fn strip_jsonc_comments(input: &str) -> String {
    enum State {
        Normal,
        InString,
        /// Previous char was a backslash inside a string literal.
        Escaped,
    }

    let mut out = String::with_capacity(input.len());
    let mut state = State::Normal;
    let mut iter = input.chars().peekable();

    while let Some(ch) = iter.next() {
        match state {
            State::Escaped => {
                out.push(ch);
                state = State::InString;
            }
            State::InString => {
                out.push(ch);
                if ch == '\\' {
                    state = State::Escaped;
                } else if ch == '"' {
                    state = State::Normal;
                }
            }
            State::Normal => {
                if ch == '"' {
                    out.push(ch);
                    state = State::InString;
                } else if ch == '/' && iter.peek() == Some(&'/') {
                    iter.next();
                    // Line comment: drop everything up to (but keep) '\n'.
                    for nc in iter.by_ref() {
                        if nc == '\n' {
                            out.push(nc);
                            break;
                        }
                    }
                } else if ch == '/' && iter.peek() == Some(&'*') {
                    iter.next();
                    // Block comment: drop everything through the closing */.
                    while let Some(nc) = iter.next() {
                        if nc == '*' && iter.peek() == Some(&'/') {
                            iter.next();
                            break;
                        }
                    }
                } else {
                    out.push(ch);
                }
            }
        }
    }

    out
}
serde_json::Value::Object(properties)); + schema.insert("keyCount".into(), serde_json::json!(obj.len())); + serde_json::Value::Object(schema) + } + } +} + +async fn read_json_exec( + ctx: GitToolCtx, + args: serde_json::Value, +) -> Result { + let p: serde_json::Map = + serde_json::from_value(args).map_err(|e| e.to_string())?; + + let project_name = p + .get("project_name") + .and_then(|v| v.as_str()) + .ok_or("missing project_name")?; + let repo_name = p + .get("repo_name") + .and_then(|v| v.as_str()) + .ok_or("missing repo_name")?; + let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?; + let rev = p + .get("rev") + .and_then(|v| v.as_str()) + .map(String::from) + .unwrap_or_else(|| "HEAD".to_string()); + let query = p.get("query").and_then(|v| v.as_str()).map(String::from); + let max_depth = p.get("schema_depth").and_then(|v| v.as_u64()).unwrap_or(4) as usize; + let pretty = p.get("pretty").and_then(|v| v.as_bool()).unwrap_or(false); + + let domain = ctx.open_repo(project_name, repo_name).await?; + + let commit_oid = if rev.len() >= 40 { + git::commit::types::CommitOid::new(&rev) + } else { + domain + .commit_get_prefix(&rev) + .map_err(|e| e.to_string())? 
+ .oid + }; + + let entry = domain + .tree_entry_by_path_from_commit(&commit_oid, path) + .map_err(|e| e.to_string())?; + let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?; + + let data = &content.content; + if data.len() > MAX_FILE_SIZE { + return Err(format!( + "file too large ({} bytes), max {} bytes", + data.len(), + MAX_FILE_SIZE + )); + } + + let text = String::from_utf8_lossy(data); + let is_jsonc = path.ends_with(".jsonc") || path.ends_with(".vscodeignore") || text.contains("//"); + + let json_text = if is_jsonc { + strip_jsonc_comments(&text) + } else { + text.to_string() + }; + + let parsed: JsonValue = serde_json::from_str(&json_text) + .map_err(|e| format!("JSON parse error at {}: {}", e.line(), e))?; + + // Apply JSONPath-like query + let result = if let Some(ref q) = query { + query_json(&parsed, q)? + } else { + parsed + }; + + let schema = infer_schema(&result, max_depth); + + let display = if pretty { + serde_json::to_string_pretty(&result).unwrap_or_default() + } else { + serde_json::to_string(&result).unwrap_or_default() + }; + + Ok(serde_json::json!({ + "path": path, + "rev": rev, + "format": if is_jsonc { "jsonc" } else { "json" }, + "size_bytes": data.len(), + "schema": schema, + "data": if display.chars().count() > 5000 { + format!("{}... (truncated, {} chars total)", &display[..5000], display.chars().count()) + } else { display }, + })) +} + +/// Simple JSONPath-like query support. +/// Supports: $.key, $[0], $.key.nested, $.arr[0].field +fn query_json(value: &JsonValue, query: &str) -> Result { + let query = query.trim(); + let query = if query.starts_with("$.") { + &query[2..] + } else if query.starts_with('$') && query.len() > 1 { + &query[1..] 
+ } else { + query + }; + + let mut current = value.clone(); + + for part in query.split('.') { + if part.is_empty() { + continue; + } + + // Handle array index like [0] + if let Some(idx_start) = part.find('[') { + let key = &part[..idx_start]; + if !key.is_empty() { + if let JsonValue::Object(obj) = ¤t { + current = obj.get(key).cloned().unwrap_or(JsonValue::Null); + } else { + return Err(format!("cannot access property '{}' on non-object", key)); + } + } + + let rest = &part[idx_start..]; + for bracket in rest.split_inclusive(']') { + if bracket.is_empty() || bracket == "]" { + continue; + } + let inner = bracket.trim_end_matches(']'); + if let Some(idx) = inner.strip_prefix('[') { + if let Ok(index) = idx.parse::() { + if let JsonValue::Array(arr) = ¤t { + current = arr.get(index).cloned().unwrap_or(JsonValue::Null); + } else { + return Err(format!("index {} on non-array", index)); + } + } + } + } + } else { + if let JsonValue::Object(obj) = ¤t { + current = obj.get(part).cloned().unwrap_or(JsonValue::Null); + } else { + return Err(format!("property '{}' not found", part)); + } + } + } + + Ok(current) +} + +pub fn register_json_tools(registry: &mut ToolRegistry) { + let p = HashMap::from([ + ("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }), + ("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }), + ("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the JSON or JSONC file".into()), required: true, properties: None, items: None }), + ("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }), + ("query".into(), 
ToolParam { name: "query".into(), param_type: "string".into(), description: Some("JSONPath-like query (e.g. $.config.items[0].name) to extract a subset of the document".into()), required: false, properties: None, items: None }), + ("schema_depth".into(), ToolParam { name: "schema_depth".into(), param_type: "integer".into(), description: Some("How deep to infer the JSON schema (default: 4)".into()), required: false, properties: None, items: None }), + ("pretty".into(), ToolParam { name: "pretty".into(), param_type: "boolean".into(), description: Some("Pretty-print the output (default: false)".into()), required: false, properties: None, items: None }), + ]); + let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) }; + registry.register( + ToolDefinition::new("read_json") + .description("Parse, validate, and query JSON and JSONC files. Supports JSONPath-like queries ($.key, $.arr[0]), schema inference, and pretty-printing. Automatically detects JSONC (with // comments).") + .parameters(schema), + ToolHandler::new(|ctx, args| { + let gctx = GitToolCtx::new(ctx); + Box::pin(async move { + read_json_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) + }) + }), + ); +} diff --git a/libs/service/file_tools/markdown.rs b/libs/service/file_tools/markdown.rs new file mode 100644 index 0000000..a3fa466 --- /dev/null +++ b/libs/service/file_tools/markdown.rs @@ -0,0 +1,286 @@ +//! read_markdown — parse and analyze Markdown files. 
+ +use crate::file_tools::MAX_FILE_SIZE; +use crate::git_tools::ctx::GitToolCtx; +use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema}; +use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Parser, Tag, TagEnd}; +use std::collections::HashMap; + +async fn read_markdown_exec( + ctx: GitToolCtx, + args: serde_json::Value, +) -> Result { + let p: serde_json::Map = + serde_json::from_value(args).map_err(|e| e.to_string())?; + + let project_name = p + .get("project_name") + .and_then(|v| v.as_str()) + .ok_or("missing project_name")?; + let repo_name = p + .get("repo_name") + .and_then(|v| v.as_str()) + .ok_or("missing repo_name")?; + let path = p + .get("path") + .and_then(|v| v.as_str()) + .ok_or("missing path")?; + let rev = p + .get("rev") + .and_then(|v| v.as_str()) + .map(String::from) + .unwrap_or_else(|| "HEAD".to_string()); + let include_code = p + .get("include_code") + .and_then(|v| v.as_bool()) + .unwrap_or(true); + let sections_only = p + .get("sections_only") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + let domain = ctx.open_repo(project_name, repo_name).await?; + + let commit_oid = if rev.len() >= 40 { + git::commit::types::CommitOid::new(&rev) + } else { + domain + .commit_get_prefix(&rev) + .map_err(|e| e.to_string())? 
+ .oid + }; + + let entry = domain + .tree_entry_by_path_from_commit(&commit_oid, path) + .map_err(|e| e.to_string())?; + let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?; + + let data = &content.content; + if data.len() > MAX_FILE_SIZE { + return Err(format!( + "file too large ({} bytes), max {} bytes", + data.len(), + MAX_FILE_SIZE + )); + } + + let text = String::from_utf8_lossy(data); + let parser = Parser::new(&text); + + let mut sections: Vec = Vec::new(); + let mut code_blocks: Vec = Vec::new(); + let mut links: Vec = Vec::new(); + let mut images: Vec = Vec::new(); + + let mut current_heading_level: Option = None; + let mut current_heading_text = String::new(); + let mut in_code_block = false; + let mut code_block_lang = String::new(); + let mut code_block_content = String::new(); + + let mut toc: Vec = Vec::new(); + + for event in parser { + match event { + Event::Start(Tag::Heading { level, .. }) => { + current_heading_level = Some(match level { + HeadingLevel::H1 => 1, + HeadingLevel::H2 => 2, + HeadingLevel::H3 => 3, + HeadingLevel::H4 => 4, + HeadingLevel::H5 => 5, + HeadingLevel::H6 => 6, + }); + current_heading_text.clear(); + } + Event::End(TagEnd::Heading(level)) => { + let lvl = match level { + HeadingLevel::H1 => 1, + HeadingLevel::H2 => 2, + HeadingLevel::H3 => 3, + HeadingLevel::H4 => 4, + HeadingLevel::H5 => 5, + HeadingLevel::H6 => 6, + }; + let heading = current_heading_text.trim().to_string(); + if !heading.is_empty() { + let section = serde_json::json!({ + "level": lvl, + "title": heading, + }); + toc.push(section.clone()); + if !sections_only { + sections.push(serde_json::json!({ + "level": lvl, + "title": heading, + "content": "", + })); + } + } + current_heading_level = None; + } + Event::Text(text) => { + if in_code_block { + code_block_content.push_str(&text); + code_block_content.push('\n'); + } else if let Some(_) = current_heading_level { + current_heading_text.push_str(&text); + current_heading_text.push(' 
'); + } + } + Event::Code(code) => { + code_blocks.push(serde_json::json!({ + "language": "", + "code": code.as_ref(), + })); + } + Event::Start(Tag::CodeBlock(kind)) => { + in_code_block = true; + code_block_content.clear(); + code_block_lang = match kind { + CodeBlockKind::Fenced(info) => info.as_ref().to_string(), + CodeBlockKind::Indented => String::new(), + }; + } + Event::End(TagEnd::CodeBlock) => { + in_code_block = false; + if include_code { + code_blocks.push(serde_json::json!({ + "language": code_block_lang, + "code": code_block_content.trim().to_string(), + })); + } + code_block_lang.clear(); + } + Event::Start(Tag::Link { dest_url, .. }) => { + links.push(serde_json::json!({ "url": dest_url.to_string() })); + } + Event::Start(Tag::Image { dest_url, .. }) => { + images.push(serde_json::json!({ "url": dest_url.to_string() })); + } + _ => {} + } + } + + // Build outline (h1/h2/h3 only) + let outline: Vec = toc + .iter() + .filter(|s| { + let lvl = s.get("level").and_then(|v| v.as_u64()).unwrap_or(0) as u32; + lvl <= 3 + }) + .cloned() + .collect(); + + Ok(serde_json::json!({ + "path": path, + "rev": rev, + "stats": { + "chars": text.chars().count(), + "words": text.split_whitespace().count(), + "lines": text.lines().count(), + "headings": toc.len(), + "code_blocks": code_blocks.len(), + "links": links.len(), + "images": images.len(), + }, + "outline": outline, + "headings": toc, + "code_blocks": if include_code { code_blocks } else { vec![] }, + "links": links, + "images": images, + })) +} + +pub fn register_markdown_tools(registry: &mut ToolRegistry) { + let p = HashMap::from([ + ( + "project_name".into(), + ToolParam { + name: "project_name".into(), + param_type: "string".into(), + description: Some("Project name (slug)".into()), + required: true, + properties: None, + items: None, + }, + ), + ( + "repo_name".into(), + ToolParam { + name: "repo_name".into(), + param_type: "string".into(), + description: Some("Repository name".into()), + required: true, + 
properties: None, + items: None, + }, + ), + ( + "path".into(), + ToolParam { + name: "path".into(), + param_type: "string".into(), + description: Some("File path to the Markdown file".into()), + required: true, + properties: None, + items: None, + }, + ), + ( + "rev".into(), + ToolParam { + name: "rev".into(), + param_type: "string".into(), + description: Some("Git revision (default: HEAD)".into()), + required: false, + properties: None, + items: None, + }, + ), + ( + "sections_only".into(), + ToolParam { + name: "sections_only".into(), + param_type: "boolean".into(), + description: Some( + "If true, return only section headings (outline). Default: false".into(), + ), + required: false, + properties: None, + items: None, + }, + ), + ( + "include_code".into(), + ToolParam { + name: "include_code".into(), + param_type: "boolean".into(), + description: Some("Include code blocks in result. Default: true".into()), + required: false, + properties: None, + items: None, + }, + ), + ]); + let schema = ToolSchema { + schema_type: "object".into(), + properties: Some(p), + required: Some(vec![ + "project_name".into(), + "repo_name".into(), + "path".into(), + ]), + }; + registry.register( + ToolDefinition::new("read_markdown") + .description("Parse and analyze a Markdown file. Returns document statistics, heading outline, code blocks with languages, links, and images.") + .parameters(schema), + ToolHandler::new(|ctx, args| { + let gctx = GitToolCtx::new(ctx); + Box::pin(async move { + read_markdown_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) + }) + }), + ); +} diff --git a/libs/service/file_tools/mod.rs b/libs/service/file_tools/mod.rs new file mode 100644 index 0000000..40f629f --- /dev/null +++ b/libs/service/file_tools/mod.rs @@ -0,0 +1,39 @@ +//! File reading and search tools for AI agents. +//! +//! Tools for reading structured files (CSV, Excel, Word, PDF, PPT, Markdown, +//! SQL, JSON) and searching across repository files (git_grep). +//! +//! 
All tools operate on repository blobs (read via git context) or standalone +//! content, returning structured JSON suitable for AI consumption. + +pub mod csv; +// TODO: fix calamine 0.26 API compatibility (open_workbook path requirement) +// pub mod excel; +pub mod grep; +pub mod json; +pub mod markdown; +// TODO: fix lopdf 0.34 API (no load_from_mem, different stream API) +// pub mod pdf; +// TODO: fix ppt archive borrow checker issue +// pub mod ppt; +pub mod sql; +// TODO: fix quick-xml 0.37 + zip Cursor API +// pub mod word; + +use agent::ToolRegistry; + +/// Maximum number of bytes to read from any single file (prevents huge blobs). +const MAX_FILE_SIZE: usize = 2 * 1024 * 1024; // 2MB + +/// Registers all file tools into a ToolRegistry. +pub fn register_all(registry: &mut ToolRegistry) { + grep::register_grep_tools(registry); + csv::register_csv_tools(registry); + // excel::register_excel_tools(registry); + // word::register_word_tools(registry); + // pdf::register_pdf_tools(registry); + // ppt::register_ppt_tools(registry); + markdown::register_markdown_tools(registry); + sql::register_sql_tools(registry); + json::register_json_tools(registry); +} diff --git a/libs/service/file_tools/pdf.rs b/libs/service/file_tools/pdf.rs new file mode 100644 index 0000000..2cd5261 --- /dev/null +++ b/libs/service/file_tools/pdf.rs @@ -0,0 +1,244 @@ +//! read_pdf — extract text from PDF files. + +use crate::file_tools::MAX_FILE_SIZE; +use crate::git_tools::ctx::GitToolCtx; +use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema}; +use futures::FutureExt; +use lopdf::{Document, Object, ObjectId}; +use std::collections::HashMap; + +/// Extract text content from a PDF page's content stream. 
+fn extract_page_text(doc: &Document, page_id: ObjectId) -> String { + let mut text = String::new(); + + // Get page dictionary + let page_dict = match doc.get(page_id) { + Ok(dict) => dict, + Err(_) => return text, + }; + + // Get content streams (can be a single stream or array) + let content_streams = match page_dict.get(b"Contents") { + Ok(obj) => obj.clone(), + Err(_) => return text, + }; + + let stream_ids: Vec = match &content_streams { + Object::Reference(id) => vec![*id], + Object::Array(arr) => arr + .iter() + .filter_map(|o| { + if let Object::Reference(id) = o { + Some(*id) + } else { + None + } + }) + .collect(), + _ => return text, + }; + + for stream_id in stream_ids { + if let Ok((_, stream)) = doc.get_stream(stream_id) { + // Decode the stream + if let Ok(decompressed) = stream.decompressed_content() { + text.push_str(&extract_text_from_content(&decompress_pdf_stream(&decompressed))); + text.push('\n'); + } + } + } + + text +} + +/// Very simple PDF content stream text extraction. +/// Handles Tj, TJ, Td, T*, ', " operators. 
+fn extract_text_from_content(content: &[u8]) -> String { + let data = String::from_utf8_lossy(content); + let mut result = String::new(); + let mut in_parens = false; + let mut current_text = String::new(); + let mut last_was_tj = false; + + let mut chars = data.chars().peekable(); + + while let Some(c) = chars.next() { + match c { + '(' => { + in_parens = true; + current_text.clear(); + } + ')' if in_parens => { + in_parens = false; + if !current_text.is_empty() { + if last_was_tj { + // TJ operator: subtract current text width offset + } + result.push_str(¤t_text); + result.push(' '); + last_was_tj = false; + } + } + c if in_parens => { + if c == '\\' { + if let Some(escaped) = chars.next() { + match escaped { + 'n' => current_text.push('\n'), + 'r' => current_text.push('\r'), + 't' => current_text.push('\t'), + _ => current_text.push(escaped), + } + } + } else { + current_text.push(c); + } + } + '%' => { + // Comment, skip to end of line + while let Some(nc) = chars.next() { + if nc == '\n' || nc == '\r' { + break; + } + } + } + _ => {} + } + } + + // Clean up excessive newlines + let lines: Vec<&str> = result.lines().map(|l| l.trim()).filter(|l| !l.is_empty()).collect(); + lines.join("\n") +} + +fn decompress_pdf_stream(data: &[u8]) -> Vec { + // Try to detect and decompress flate/zlib streams + if data.len() < 2 { + return data.to_vec(); + } + + // Simple zlib check: zlib-wrapped deflate starts with 0x78 + if data.starts_with(&[0x78]) || data.starts_with(&[0x08, 0x1b]) { + if let Ok(decoded) = flate2::read::ZlibDecoder::new(data).bytes().collect::, _>>() { + return decoded; + } + } + + // Try raw deflate + if let Ok(decoded) = flate2::read::DeflateDecoder::new(data).bytes().collect::, _>>() { + return decoded; + } + + data.to_vec() +} + +async fn read_pdf_exec( + ctx: GitToolCtx, + args: serde_json::Value, +) -> Result { + let p: serde_json::Map = + serde_json::from_value(args).map_err(|e| e.to_string())?; + + let project_name = p + .get("project_name") + 
.and_then(|v| v.as_str()) + .ok_or("missing project_name")?; + let repo_name = p + .get("repo_name") + .and_then(|v| v.as_str()) + .ok_or("missing repo_name")?; + let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?; + let rev = p + .get("rev") + .and_then(|v| v.as_str()) + .map(String::from) + .unwrap_or_else(|| "HEAD".to_string()); + let page_start = p.get("page_start").and_then(|v| v.as_u64()).map(|v| v as usize); + let page_end = p.get("page_end").and_then(|v| v.as_u64()).map(|v| v as usize); + let max_pages = p + .get("max_pages") + .and_then(|v| v.as_u64()) + .unwrap_or(20) as usize; + + let domain = ctx.open_repo(project_name, repo_name).await?; + + let commit_oid = if rev.len() >= 40 { + git::commit::types::CommitOid::new(&rev) + } else { + domain + .commit_get_prefix(&rev) + .map_err(|e| e.to_string())? + .oid + }; + + let entry = domain + .tree_entry_by_path_from_commit(&commit_oid, path) + .map_err(|e| e.to_string())?; + let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?; + + let data = &content.content; + if data.len() > MAX_FILE_SIZE { + return Err(format!( + "file too large ({} bytes), max {} bytes", + data.len(), + MAX_FILE_SIZE + )); + } + + let doc = Document::load_from_mem(data) + .map_err(|e| format!("failed to parse PDF: {}", e))?; + + // Get all page references + let pages: Vec = doc + .pages + .values() + .cloned() + .collect(); + + let total_pages = pages.len(); + + let start = page_start.unwrap_or(0).min(total_pages.saturating_sub(1)); + let end = page_end.unwrap_or(start + max_pages).min(total_pages); + + let mut page_texts: Vec = Vec::new(); + + for (i, page_id) in pages.iter().enumerate().skip(start).take(end - start) { + let text = extract_page_text(&doc, *page_id); + page_texts.push(serde_json::json!({ + "page": i + 1, + "text": text, + "char_count": text.chars().count(), + })); + } + + Ok(serde_json::json!({ + "path": path, + "rev": rev, + "total_pages": total_pages, + "extracted_pages": 
page_texts.len(), + "pages": page_texts, + })) +} + +pub fn register_pdf_tools(registry: &mut ToolRegistry) { + let p = HashMap::from([ + ("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }), + ("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }), + ("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the PDF document".into()), required: true, properties: None, items: None }), + ("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }), + ("page_start".into(), ToolParam { name: "page_start".into(), param_type: "integer".into(), description: Some("1-based starting page number (default: 1)".into()), required: false, properties: None, items: None }), + ("page_end".into(), ToolParam { name: "page_end".into(), param_type: "integer".into(), description: Some("1-based ending page number (default: page_start + 20)".into()), required: false, properties: None, items: None }), + ("max_pages".into(), ToolParam { name: "max_pages".into(), param_type: "integer".into(), description: Some("Maximum number of pages to extract (default: 20)".into()), required: false, properties: None, items: None }), + ]); + let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) }; + registry.register( + ToolDefinition::new("read_pdf") + .description("Extract text content from PDF files. Returns page-by-page text extraction with character counts. 
Supports page range selection.") + .parameters(schema), + ToolHandler::new(|ctx, args| { + let gctx = GitToolCtx::new(ctx); + Box::pin(async move { + read_pdf_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) + }) + }), + ); +} diff --git a/libs/service/file_tools/ppt.rs b/libs/service/file_tools/ppt.rs new file mode 100644 index 0000000..ae969ac --- /dev/null +++ b/libs/service/file_tools/ppt.rs @@ -0,0 +1,204 @@ +//! read_ppt — extract text from PowerPoint files (.pptx). + +use crate::file_tools::MAX_FILE_SIZE; +use crate::git_tools::ctx::GitToolCtx; +use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema}; +use futures::FutureExt; +use std::collections::HashMap; +use zip::ZipArchive; + +async fn read_ppt_exec( + ctx: GitToolCtx, + args: serde_json::Value, +) -> Result { + let p: serde_json::Map = + serde_json::from_value(args).map_err(|e| e.to_string())?; + + let project_name = p + .get("project_name") + .and_then(|v| v.as_str()) + .ok_or("missing project_name")?; + let repo_name = p + .get("repo_name") + .and_then(|v| v.as_str()) + .ok_or("missing repo_name")?; + let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?; + let rev = p + .get("rev") + .and_then(|v| v.as_str()) + .map(String::from) + .unwrap_or_else(|| "HEAD".to_string()); + let slide_start = p.get("slide_start").and_then(|v| v.as_u64()).map(|v| v as usize); + let slide_end = p.get("slide_end").and_then(|v| v.as_u64()).map(|v| v as usize); + let include_notes = p + .get("include_notes") + .and_then(|v| v.as_bool()) + .unwrap_or(false); + + let domain = ctx.open_repo(project_name, repo_name).await?; + + let commit_oid = if rev.len() >= 40 { + git::commit::types::CommitOid::new(&rev) + } else { + domain + .commit_get_prefix(&rev) + .map_err(|e| e.to_string())? 
+ .oid + }; + + let entry = domain + .tree_entry_by_path_from_commit(&commit_oid, path) + .map_err(|e| e.to_string())?; + let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?; + + let data = &content.content; + if data.len() > MAX_FILE_SIZE { + return Err(format!( + "file too large ({} bytes), max {} bytes", + data.len(), + MAX_FILE_SIZE + )); + } + + let cursor = std::io::Cursor::new(data.clone()); + let mut archive = + ZipArchive::new(cursor).map_err(|e| format!("failed to read PPTX ZIP: {}", e))?; + + let mut slides: Vec = Vec::new(); + + // Collect all slide file names + let mut slide_files: Vec = (1..=1000) + .filter_map(|i| { + let name = format!("ppt/slides/slide{}.xml", i); + if archive.by_name(&name).is_ok() { + Some(name) + } else { + None + } + }) + .collect(); + + let total_slides = slide_files.len(); + let start = slide_start.unwrap_or(0).min(total_slides.saturating_sub(1)); + let end = slide_end.unwrap_or(start + 50).min(total_slides); + + for slide_file in slide_files.iter().skip(start).take(end - start) { + let slide_idx = slides.len() + start + 1; + + let mut file = archive + .by_name(slide_file) + .map_err(|e| format!("failed to read slide {}: {}", slide_file, e))?; + let mut xml_content = String::new(); + use std::io::Read; + file.read_to_string(&mut xml_content) + .map_err(|e| e.to_string())?; + + // Extract text from slide XML + let text = extract_text_from_pptx_xml(&xml_content); + + // Optionally extract notes + let notes = if include_notes { + let notes_file = format!("ppt/notesSlides/notesSlide{}.xml", slide_idx); + if let Ok(mut notes_file) = archive.by_name(¬es_file) { + let mut notes_xml = String::new(); + if notes_file.read_to_string(&mut notes_xml).is_ok() { + Some(extract_text_from_pptx_xml(¬es_xml)) + } else { + None + } + } else { + None + } + } else { + None + }; + + slides.push(serde_json::json!({ + "slide": slide_idx, + "text": text.clone(), + "char_count": text.chars().count(), + "notes": notes, + })); + } + 
+    Ok(serde_json::json!({
+        "path": path,
+        "rev": rev,
+        "total_slides": total_slides,
+        "extracted_slides": slides.len(),
+        "slides": slides,
+    }))
+}
+
+/// Extract text content from PPTX slide XML using simple tag extraction.
+fn extract_text_from_pptx_xml(xml: &str) -> String {
+    // PPTX uses <a:t> tags for text content
+    let mut results: Vec<&str> = Vec::new();
+    let mut last_end = 0;
+
+    while let Some(start) = xml[last_end..].find("<a:t") {
+        let abs_start = last_end + start;
+        if let Some(tag_end) = xml[abs_start..].find('>') {
+            let content_start = abs_start + tag_end + 1;
+            if let Some(end_tag) = xml[content_start..].find("</a:t>") {
+                let text = &xml[content_start..content_start + end_tag];
+                let trimmed = text.trim();
+                if !trimmed.is_empty() {
+                    results.push(trimmed);
+                }
+                last_end = content_start + end_tag + 7; // len of </a:t>
+            } else {
+                break;
+            }
+        } else {
+            break;
+        }
+    }
+
+    // Also try <w:t> tags (notes slides use Word namespaces)
+    let mut last_end = 0;
+    while let Some(start) = xml[last_end..].find("<w:t") {
+        let abs_start = last_end + start;
+        if let Some(tag_end) = xml[abs_start..].find('>') {
+            let content_start = abs_start + tag_end + 1;
+            if let Some(end_tag) = xml[content_start..].find("</w:t>") {
+                let text = &xml[content_start..content_start + end_tag];
+                let trimmed = text.trim();
+                if !trimmed.is_empty() && !results.contains(&trimmed) {
+                    results.push(trimmed);
+                }
+                last_end = content_start + end_tag + 6; // len of </w:t>
+            } else {
+                break;
+            }
+        } else {
+            break;
+        }
+    }
+
+    results.join(" ")
+}
+
+pub fn register_ppt_tools(registry: &mut ToolRegistry) {
+    let p = HashMap::from([
+        ("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
+        ("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
+        ("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the .pptx document".into()), required: true, properties: None, items: None }),
+
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }), + ("slide_start".into(), ToolParam { name: "slide_start".into(), param_type: "integer".into(), description: Some("1-based starting slide number (default: 1)".into()), required: false, properties: None, items: None }), + ("slide_end".into(), ToolParam { name: "slide_end".into(), param_type: "integer".into(), description: Some("1-based ending slide number".into()), required: false, properties: None, items: None }), + ("include_notes".into(), ToolParam { name: "include_notes".into(), param_type: "boolean".into(), description: Some("Include speaker notes (default: false)".into()), required: false, properties: None, items: None }), + ]); + let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) }; + registry.register( + ToolDefinition::new("read_ppt") + .description("Extract text content from PowerPoint presentations (.pptx). Returns slide-by-slide text with character counts. Supports slide range selection and speaker notes.") + .parameters(schema), + ToolHandler::new(|ctx, args| { + let gctx = GitToolCtx::new(ctx); + Box::pin(async move { + read_ppt_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) + }) + }), + ); +} diff --git a/libs/service/file_tools/sql.rs b/libs/service/file_tools/sql.rs new file mode 100644 index 0000000..0f6c3e3 --- /dev/null +++ b/libs/service/file_tools/sql.rs @@ -0,0 +1,154 @@ +//! read_sql — parse and analyze SQL files. 
+
+use crate::file_tools::MAX_FILE_SIZE;
+use crate::git_tools::ctx::GitToolCtx;
+use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
+use sqlparser::ast::{Statement, ColumnDef};
+use sqlparser::dialect::{GenericDialect, MySqlDialect, PostgreSqlDialect, SQLiteDialect};
+use sqlparser::parser::Parser;
+use std::collections::HashMap;
+
+async fn read_sql_exec(
+    ctx: GitToolCtx,
+    args: serde_json::Value,
+) -> Result<serde_json::Value, String> {
+    let p: serde_json::Map<String, serde_json::Value> =
+        serde_json::from_value(args).map_err(|e| e.to_string())?;
+
+    let project_name = p
+        .get("project_name")
+        .and_then(|v| v.as_str())
+        .ok_or("missing project_name")?;
+    let repo_name = p
+        .get("repo_name")
+        .and_then(|v| v.as_str())
+        .ok_or("missing repo_name")?;
+    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
+    let rev = p
+        .get("rev")
+        .and_then(|v| v.as_str())
+        .map(String::from)
+        .unwrap_or_else(|| "HEAD".to_string());
+    let dialect = p.get("dialect").and_then(|v| v.as_str()).unwrap_or("generic");
+
+    let domain = ctx.open_repo(project_name, repo_name).await?;
+
+    let commit_oid = if rev.len() >= 40 {
+        git::commit::types::CommitOid::new(&rev)
+    } else {
+        domain
+            .commit_get_prefix(&rev)
+            .map_err(|e| e.to_string())?
+            .oid
+    };
+
+    let entry = domain
+        .tree_entry_by_path_from_commit(&commit_oid, path)
+        .map_err(|e| e.to_string())?;
+    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
+
+    let data = &content.content;
+    if data.len() > MAX_FILE_SIZE {
+        return Err(format!(
+            "file too large ({} bytes), max {} bytes",
+            data.len(),
+            MAX_FILE_SIZE
+        ));
+    }
+
+    let text = String::from_utf8_lossy(data);
+
+    let parser_dialect: Box<dyn sqlparser::dialect::Dialect> = match dialect {
+        "mysql" => Box::new(MySqlDialect {}),
+        "postgresql" | "postgres" => Box::new(PostgreSqlDialect {}),
+        "sqlite" => Box::new(SQLiteDialect {}),
+        _ => Box::new(GenericDialect {}),
+    };
+
+    let statements = Parser::parse_sql(parser_dialect.as_ref(), &text)
+        .map_err(|e| format!("SQL parse error: {}", e))?;
+
+    let mut tables: Vec<serde_json::Value> = Vec::new();
+    let mut views: Vec<serde_json::Value> = Vec::new();
+    let mut functions: Vec<serde_json::Value> = Vec::new();
+    let mut indexes: Vec<serde_json::Value> = Vec::new();
+    let mut statement_kinds: std::collections::HashMap<String, usize> = std::collections::HashMap::new();
+
+    for statement in &statements {
+        let kind = format!("{:?}", statement).split('{').next().unwrap_or("unknown").to_string();
+        *statement_kinds.entry(kind).or_insert(0) += 1;
+
+        match statement {
+            Statement::CreateTable(stmt) => {
+                let name = stmt.name.to_string();
+                let columns: Vec<String> = stmt.columns.iter().map(format_column_def).collect();
+                tables.push(serde_json::json!({
+                    "name": name,
+                    "columns": columns,
+                    "if_not_exists": stmt.if_not_exists,
+                }));
+            }
+            Statement::CreateView { name, query, ..
} => {
+                views.push(serde_json::json!({
+                    "name": name.to_string(),
+                    "query": query.to_string(),
+                }));
+            }
+            Statement::CreateIndex(stmt) => {
+                indexes.push(serde_json::json!({
+                    "name": stmt.name.as_ref().map(|n| n.to_string()).unwrap_or_default(),
+                    "table": stmt.table_name.to_string(),
+                    "columns": stmt.columns.iter().map(|c| c.to_string()).collect::<Vec<String>>(),
+                }));
+            }
+            Statement::CreateFunction(stmt) => {
+                functions.push(serde_json::json!({
+                    "name": stmt.name.to_string(),
+                    "args": stmt.args.iter().flat_map(|args| args.iter().filter_map(|a| a.name.as_ref().map(|n| n.to_string()))).collect::<Vec<String>>(),
+                    "return_type": stmt.return_type.as_ref().map(|r| r.to_string()).unwrap_or_default(),
+                }));
+            }
+            _ => {}
+        }
+    }
+
+    Ok(serde_json::json!({
+        "path": path,
+        "rev": rev,
+        "dialect": dialect,
+        "statement_count": statements.len(),
+        "statement_kinds": statement_kinds,
+        "tables": tables,
+        "views": views,
+        "functions": functions,
+        "indexes": indexes,
+    }))
+}
+
+fn format_column_def(col: &ColumnDef) -> String {
+    let name = col.name.to_string();
+    let data_type = col.data_type.to_string();
+    format!("{} {}", name, data_type)
+}
+
+pub fn register_sql_tools(registry: &mut ToolRegistry) {
+    let p = HashMap::from([
+        ("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
+        ("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
+        ("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the SQL file".into()), required: true, properties: None, items: None }),
+        ("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
+
("dialect".into(), ToolParam { name: "dialect".into(), param_type: "string".into(), description: Some("SQL dialect: generic, mysql, postgresql, sqlite. Default: generic".into()), required: false, properties: None, items: None }), + ]); + let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) }; + registry.register( + ToolDefinition::new("read_sql") + .description("Parse and analyze a SQL file. Extracts CREATE TABLE statements (with columns and types), CREATE VIEW, CREATE INDEX, CREATE FUNCTION, and counts all statement types.") + .parameters(schema), + ToolHandler::new(|ctx, args| { + let gctx = GitToolCtx::new(ctx); + Box::pin(async move { + read_sql_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) + }) + }), + ); +} diff --git a/libs/service/file_tools/word.rs b/libs/service/file_tools/word.rs new file mode 100644 index 0000000..b88007e --- /dev/null +++ b/libs/service/file_tools/word.rs @@ -0,0 +1,184 @@ +//! read_word — parse and extract text from Word documents (.docx) via zip+xml. 
+
+use crate::file_tools::MAX_FILE_SIZE;
+use crate::git_tools::ctx::GitToolCtx;
+use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
+use futures::FutureExt;
+use quick_xml::events::Event;
+use quick_xml::Reader;
+use std::collections::HashMap;
+use zip::ZipArchive;
+
+async fn read_word_exec(
+    ctx: GitToolCtx,
+    args: serde_json::Value,
+) -> Result<serde_json::Value, String> {
+    let p: serde_json::Map<String, serde_json::Value> =
+        serde_json::from_value(args).map_err(|e| e.to_string())?;
+
+    let project_name = p
+        .get("project_name")
+        .and_then(|v| v.as_str())
+        .ok_or("missing project_name")?;
+    let repo_name = p
+        .get("repo_name")
+        .and_then(|v| v.as_str())
+        .ok_or("missing repo_name")?;
+    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
+    let rev = p
+        .get("rev")
+        .and_then(|v| v.as_str())
+        .map(String::from)
+        .unwrap_or_else(|| "HEAD".to_string());
+    let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
+    let limit = p
+        .get("limit")
+        .and_then(|v| v.as_u64())
+        .unwrap_or(200) as usize;
+    let sections_only = p
+        .get("sections_only")
+        .and_then(|v| v.as_bool())
+        .unwrap_or(false);
+
+    let domain = ctx.open_repo(project_name, repo_name).await?;
+
+    let commit_oid = if rev.len() >= 40 {
+        git::commit::types::CommitOid::new(&rev)
+    } else {
+        domain
+            .commit_get_prefix(&rev)
+            .map_err(|e| e.to_string())?
+            .oid
+    };
+
+    let entry = domain
+        .tree_entry_by_path_from_commit(&commit_oid, path)
+        .map_err(|e| e.to_string())?;
+    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
+
+    let data = &content.content;
+    if data.len() > MAX_FILE_SIZE {
+        return Err(format!(
+            "file too large ({} bytes), max {} bytes",
+            data.len(),
+            MAX_FILE_SIZE
+        ));
+    }
+
+    // DOCX is a ZIP archive. Read word/document.xml from it.
+    let cursor = std::io::Cursor::new(data);
+    let mut archive = ZipArchive::new(cursor).map_err(|e| {
+        format!(
+            "failed to open docx as ZIP archive: {}.
Make sure the file is a valid .docx document.",
+            e
+        )
+    })?;
+
+    let doc_xml = {
+        let file = if let Ok(f) = archive.by_name("word/document.xml") {
+            f
+        } else {
+            archive.by_name("document.xml")
+                .map_err(|_| "docx archive does not contain word/document.xml or document.xml")?
+        };
+        let mut s = String::new();
+        let mut reader = std::io::BufReader::new(file);
+        std::io::Read::read_to_string(&mut reader, &mut s)
+            .map_err(|e| format!("failed to read document.xml: {}", e))?;
+        s
+    };
+
+    // Parse paragraphs from <w:p> elements
+    let mut reader = Reader::from_str(&doc_xml);
+    reader.config_mut().trim_text(false);
+
+    let mut paragraphs: Vec<String> = Vec::new();
+    let mut buf = Vec::new();
+    let mut in_paragraph = false;
+    let mut current_text = String::new();
+
+    loop {
+        match reader.read_event_into(&mut buf) {
+            Ok(Event::Start(e)) => {
+                if e.name().as_ref() == b"w:p" {
+                    in_paragraph = true;
+                    current_text.clear();
+                }
+            }
+            Ok(Event::Text(e)) => {
+                if in_paragraph {
+                    let txt = e.unescape().map(|s| s.into_owned()).unwrap_or_default();
+                    current_text.push_str(&txt);
+                }
+            }
+            Ok(Event::End(e)) => {
+                if e.name().as_ref() == b"w:p" && in_paragraph {
+                    in_paragraph = false;
+                    let text = current_text.trim().to_string();
+                    if !text.is_empty() {
+                        paragraphs.push(text);
+                    }
+                }
+            }
+            Ok(Event::Eof) => break,
+            _ => {}
+        }
+        buf.clear();
+    }
+
+    let total = paragraphs.len();
+
+    let body: Vec<serde_json::Value> = if sections_only {
+        paragraphs
+            .iter()
+            .enumerate()
+            .filter(|(_, text)| {
+                text.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
+                    && text.chars().filter(|&c| c == ' ').count() < text.len() / 2
+                    && text.len() < 200
+            })
+            .skip(offset)
+            .take(limit)
+            .map(|(i, t)| serde_json::json!({ "index": i, "text": t }))
+            .collect()
+    } else {
+        paragraphs
+            .iter()
+            .skip(offset)
+            .take(limit)
+            .enumerate()
+            .map(|(i, t)| serde_json::json!({ "index": offset + i, "text": t }))
+            .collect()
+    };
+
+    Ok(serde_json::json!({
+        "path": path,
+        "rev": rev,
+        "paragraph_count":
total, + "paragraphs": body, + })) +} + +pub fn register_word_tools(registry: &mut ToolRegistry) { + let p = HashMap::from([ + ("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }), + ("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }), + ("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the .docx document".into()), required: true, properties: None, items: None }), + ("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }), + ("sections_only".into(), ToolParam { name: "sections_only".into(), param_type: "boolean".into(), description: Some("If true, extract only section/heading-like paragraphs (short lines starting with uppercase)".into()), required: false, properties: None, items: None }), + ("offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of paragraphs to skip (default: 0)".into()), required: false, properties: None, items: None }), + ("limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum paragraphs to return (default: 200)".into()), required: false, properties: None, items: None }), + ]); + let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) }; + registry.register( + ToolDefinition::new("read_word") + .description("Parse and extract text from Word documents (.docx). Returns paragraphs with index and text content. 
Supports pagination.")
+            .parameters(schema),
+        ToolHandler::new(|ctx, args| {
+            let gctx = GitToolCtx::new(ctx);
+            Box::pin(async move {
+                read_word_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
+            })
+        }),
+    );
+}
diff --git a/libs/service/git_tools/tree.rs b/libs/service/git_tools/tree.rs
index 64c77bf..46db052 100644
--- a/libs/service/git_tools/tree.rs
+++ b/libs/service/git_tools/tree.rs
@@ -91,6 +91,38 @@ async fn git_file_history_exec(ctx: GitToolCtx, args: serde_json::Value) -> Resu
     Ok(serde_json::to_value(result).map_err(|e| e.to_string())?)
 }
 
+async fn git_blob_get_exec(ctx: GitToolCtx, args: serde_json::Value) -> Result<serde_json::Value, String> {
+    let p: serde_json::Map<String, serde_json::Value> = serde_json::from_value(args).map_err(|e| e.to_string())?;
+    let project_name = p.get("project_name").and_then(|v| v.as_str()).ok_or("missing project_name")?;
+    let repo_name = p.get("repo_name").and_then(|v| v.as_str()).ok_or("missing repo_name")?;
+    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
+    let rev = p.get("rev").and_then(|v| v.as_str()).map(String::from).unwrap_or_else(|| "HEAD".to_string());
+
+    let domain = ctx.open_repo(project_name, repo_name).await?;
+    let oid = if rev.len() >= 40 {
+        git::commit::types::CommitOid::new(&rev)
+    } else {
+        domain.commit_get_prefix(&rev).map_err(|e| e.to_string())?.oid
+    };
+
+    let entry = domain.tree_entry_by_path_from_commit(&oid, path).map_err(|e| e.to_string())?;
+    let blob_info = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
+
+    if blob_info.is_binary {
+        return Err(format!("file '{}' is binary, cannot return as text", path));
+    }
+
+    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
+    let text = String::from_utf8_lossy(&content.content).to_string();
+
+    Ok(serde_json::json!({
+        "path": path,
+        "oid": entry.oid.to_string(),
+        "size": blob_info.size,
+        "content": text,
+    }))
+}
+
 fn flatten_commit(c: &git::commit::types::CommitMeta) -> serde_json::Value {
     use chrono::TimeZone;
     let
ts = c.author.time_secs + (c.author.offset_minutes as i64 * 60); @@ -162,4 +194,22 @@ pub fn register_git_tools(registry: &mut ToolRegistry) { }) }), ); + + // git_blob_get + let p = HashMap::from([ + ("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }), + ("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }), + ("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path within the repository".into()), required: true, properties: None, items: None }), + ("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Revision to read file from (default: HEAD)".into()), required: false, properties: None, items: None }), + ]); + let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) }; + registry.register( + ToolDefinition::new("git_blob_get").description("Retrieve the raw content of a single file (blob) at a given revision. 
Returns error if the file is binary.").parameters(schema), + ToolHandler::new(|ctx, args| { + let gctx = super::ctx::GitToolCtx::new(ctx); + Box::pin(async move { + git_blob_get_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) + }) + }), + ); } \ No newline at end of file diff --git a/libs/service/lib.rs b/libs/service/lib.rs index 6a4a832..a803c52 100644 --- a/libs/service/lib.rs +++ b/libs/service/lib.rs @@ -148,6 +148,7 @@ impl AppService { let client = async_openai::Client::with_config(cfg); let mut registry = ToolRegistry::new(); git_tools::register_all(&mut registry); + file_tools::register_all(&mut registry); Some(Arc::new(ChatService::new(client).with_tool_registry(registry))) } (Err(e), _) => { @@ -229,6 +230,7 @@ pub mod auth; pub mod error; pub mod git; pub mod git_tools; +pub mod file_tools; pub mod issue; pub mod project; pub mod pull_request;