gitdataai/libs/service/file_tools/json.rs
ZhenYi 1af796ac75 feat(service): add file_tools module and git_blob_get tool
Add AI-accessible tools for reading structured files (CSV, JSON/JSONC,
Markdown, SQL) and searching repository content (git_grep). Also adds
git_blob_get to retrieve raw blob text content with binary detection.

Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library
API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
2026-04-18 23:02:10 +08:00

276 lines
10 KiB
Rust

//! read_json — parse, validate, and query JSON / JSONC files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use serde_json::Value as JsonValue;
use std::collections::HashMap;
/// Remove `// line` and `/* block */` comments from JSONC text so it can be
/// handed to a strict JSON parser.
///
/// Comment markers inside string literals are left untouched; backslash
/// escapes are tracked so an escaped quote (`\"`) does not end the string.
///
/// Layout is preserved where it matters to the caller:
/// - a line comment is removed up to, but not including, its terminating
///   newline;
/// - a block comment is replaced by a single space plus every newline it
///   contained, so neighbouring tokens are not fused together and the line
///   numbers reported by the parser still match the original file.
///
/// An unterminated block comment swallows the rest of the input.
fn strip_jsonc_comments(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    let mut chars = input.chars().peekable();
    let mut in_string = false;
    let mut escaped = false;

    while let Some(c) = chars.next() {
        if in_string {
            // Inside a string everything is copied verbatim; only the escape
            // state and the closing quote need tracking.
            result.push(c);
            if escaped {
                escaped = false;
            } else if c == '\\' {
                escaped = true;
            } else if c == '"' {
                in_string = false;
            }
            continue;
        }

        match (c, chars.peek().copied()) {
            ('"', _) => {
                in_string = true;
                result.push(c);
            }
            ('/', Some('/')) => {
                // Line comment: drop everything up to the newline, keep the newline.
                chars.next();
                for nc in chars.by_ref() {
                    if nc == '\n' {
                        result.push('\n');
                        break;
                    }
                }
            }
            ('/', Some('*')) => {
                // Block comment: consume the opening `*` first so that `/*/`
                // is not mistaken for a complete comment.
                chars.next();
                // A comment acts as whitespace between the tokens around it.
                result.push(' ');
                let mut prev_star = false;
                for nc in chars.by_ref() {
                    if prev_star && nc == '/' {
                        break;
                    }
                    if nc == '\n' {
                        // Keep line structure for accurate parse-error positions.
                        result.push('\n');
                    }
                    prev_star = nc == '*';
                }
            }
            _ => result.push(c),
        }
    }
    result
}
/// Summarise the structure of `value` as a small JSON "schema" document.
///
/// Every node is an object with a `"type"` field. Arrays additionally carry
/// `"length"` and an `"items"` schema inferred from the first element only
/// (an empty array gets `"items": null` and no `"length"`). Objects carry
/// a `"properties"` map with one schema per key and a `"keyCount"`.
///
/// `max_depth` limits how many levels are described; once it reaches zero the
/// node is reported as `{ "type": "MAX_DEPTH" }` regardless of its real type.
fn infer_schema(value: &JsonValue, max_depth: usize) -> JsonValue {
    if max_depth == 0 {
        return serde_json::json!({ "type": "MAX_DEPTH" });
    }
    // Safe: the guard above guarantees max_depth >= 1.
    let child_depth = max_depth - 1;
    let scalar = |type_name: &str| serde_json::json!({ "type": type_name });

    match value {
        JsonValue::Null => scalar("null"),
        JsonValue::Bool(_) => scalar("boolean"),
        JsonValue::Number(_) => scalar("number"),
        JsonValue::String(_) => scalar("string"),
        JsonValue::Array(elements) => match elements.first() {
            None => serde_json::json!({ "type": "array", "items": null }),
            Some(head) => serde_json::json!({
                "type": "array",
                "length": elements.len(),
                "items": infer_schema(head, child_depth)
            }),
        },
        JsonValue::Object(fields) => {
            let properties: serde_json::Map<String, JsonValue> = fields
                .iter()
                .map(|(name, child)| (name.clone(), infer_schema(child, child_depth)))
                .collect();
            serde_json::json!({
                "type": "object",
                "properties": JsonValue::Object(properties),
                "keyCount": fields.len()
            })
        }
    }
}
/// Execute the `read_json` tool: load a blob from a repository revision, parse
/// it as JSON / JSONC, optionally narrow it with a query, and return the
/// serialized data together with an inferred schema.
///
/// Keys read from `args`:
/// - `project_name`, `repo_name`, `path` — strings, required
/// - `rev` — string, defaults to `"HEAD"`
/// - `query` — string, optional; forwarded to `query_json`
/// - `schema_depth` — unsigned integer, defaults to 4; forwarded to `infer_schema`
/// - `pretty` — bool, defaults to `false`
///
/// The returned object has the fields `path`, `rev`, `format`
/// (`"json"` or `"jsonc"`), `size_bytes`, `schema` and `data`. `data` is the
/// serialized result, truncated to 5000 characters with a trailing note when
/// longer.
///
/// # Errors
/// Returns a message string when a required argument is missing, when the
/// repository / commit / tree entry / blob lookup fails, when the blob exceeds
/// `MAX_FILE_SIZE`, when the text is not valid JSON after comment stripping,
/// or when the query fails.
async fn read_json_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    // Maximum number of characters of serialized data returned inline.
    const MAX_DISPLAY_CHARS: usize = 5000;

    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let query = p.get("query").and_then(|v| v.as_str()).map(String::from);
    let max_depth = p.get("schema_depth").and_then(|v| v.as_u64()).unwrap_or(4) as usize;
    let pretty = p.get("pretty").and_then(|v| v.as_bool()).unwrap_or(false);

    let domain = ctx.open_repo(project_name, repo_name).await?;
    // NOTE(review): any rev of 40+ characters is taken to be a full object id
    // without checking that it is hexadecimal; a long ref name would take this
    // branch too — confirm this is intended.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }

    let text = String::from_utf8_lossy(data);
    // Always run the comment stripper: it is string-aware, so strict JSON
    // passes through unchanged. Comparing its output with the input detects
    // real comments, whereas searching for "//" would also match URLs inside
    // string values and would miss files that only use /* */ comments.
    let json_text = strip_jsonc_comments(&text);
    let is_jsonc = path.ends_with(".jsonc")
        || path.ends_with(".vscodeignore")
        || json_text != *text;
    let parsed: JsonValue = serde_json::from_str(&json_text)
        .map_err(|e| format!("JSON parse error at {}: {}", e.line(), e))?;

    // Apply JSONPath-like query
    let result = match query.as_deref() {
        Some(q) => query_json(&parsed, q)?,
        None => parsed,
    };
    let schema = infer_schema(&result, max_depth);
    let display = if pretty {
        serde_json::to_string_pretty(&result).unwrap_or_default()
    } else {
        serde_json::to_string(&result).unwrap_or_default()
    };

    let total_chars = display.chars().count();
    let data_field = if total_chars > MAX_DISPLAY_CHARS {
        // The limit is counted in characters, so the cut must be made at the
        // byte offset of the character at that position. Slicing at a fixed
        // byte index panics when it lands inside a multi-byte character.
        let cut = display
            .char_indices()
            .nth(MAX_DISPLAY_CHARS)
            .map_or(display.len(), |(byte_idx, _)| byte_idx);
        format!(
            "{}... (truncated, {} chars total)",
            &display[..cut],
            total_chars
        )
    } else {
        display
    };

    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "format": if is_jsonc { "jsonc" } else { "json" },
        "size_bytes": data.len(),
        "schema": schema,
        "data": data_field,
    }))
}
/// Simple JSONPath-like query support.
///
/// Supports: `$`, `$.key`, `$[0]`, `$.key.nested`, `$.arr[0].field`.
/// The leading `$` / `$.` is optional, and an empty query (or a bare `$`)
/// returns the whole document.
///
/// A key that is missing from an object, or an index past the end of an
/// array, yields `null` rather than an error.
///
/// # Errors
/// - a property is requested on a value that is not an object;
/// - an index is applied to a value that is not an array;
/// - a bracket expression is unclosed, is followed by stray text, or does not
///   contain a non-negative integer.
fn query_json(value: &JsonValue, query: &str) -> Result<JsonValue, String> {
    let query = query.trim();
    // Drop the optional root marker. Trying "$." first keeps "$.a" from turning
    // into ".a"; the bare '$' case then covers "$", "$[0]" and "$a".
    let path = query
        .strip_prefix("$.")
        .or_else(|| query.strip_prefix('$'))
        .unwrap_or(query);

    // Walk by reference and clone only the final node, instead of cloning the
    // whole document up front and again at every step.
    let null = JsonValue::Null;
    let mut current: &JsonValue = value;

    for part in path.split('.').filter(|segment| !segment.is_empty()) {
        // Split "key[0][1]" into the key ("key") and its bracket tail ("[0][1]").
        let (key, mut brackets) = match part.find('[') {
            Some(pos) => part.split_at(pos),
            None => (part, ""),
        };

        if !key.is_empty() {
            current = match current {
                JsonValue::Object(obj) => obj.get(key).unwrap_or(&null),
                _ => return Err(format!("cannot access property '{}' on non-object", key)),
            };
        }

        while !brackets.is_empty() {
            let (raw_index, tail) = brackets
                .strip_prefix('[')
                .and_then(|after_open| after_open.split_once(']'))
                .ok_or_else(|| format!("malformed index in query segment '{}'", part))?;
            // Reject non-numeric indices explicitly; silently skipping them
            // would return a different node than the caller asked for.
            let index: usize = raw_index
                .parse()
                .map_err(|_| format!("invalid array index '{}'", raw_index))?;
            current = match current {
                JsonValue::Array(arr) => arr.get(index).unwrap_or(&null),
                _ => return Err(format!("index {} on non-array", index)),
            };
            brackets = tail;
        }
    }
    Ok(current.clone())
}
/// Register the `read_json` tool (definition, parameter schema and handler)
/// with the given registry.
pub fn register_json_tools(registry: &mut ToolRegistry) {
    // One row per tool parameter: (name, type, description, required).
    let specs: [(&str, &str, &str, bool); 7] = [
        ("project_name", "string", "Project name (slug)", true),
        ("repo_name", "string", "Repository name", true),
        ("path", "string", "File path to the JSON or JSONC file", true),
        ("rev", "string", "Git revision (default: HEAD)", false),
        (
            "query",
            "string",
            "JSONPath-like query (e.g. $.config.items[0].name) to extract a subset of the document",
            false,
        ),
        (
            "schema_depth",
            "integer",
            "How deep to infer the JSON schema (default: 4)",
            false,
        ),
        ("pretty", "boolean", "Pretty-print the output (default: false)", false),
    ];

    let params: HashMap<_, _> = specs
        .iter()
        .map(|&(name, kind, help, required)| {
            (
                name.into(),
                ToolParam {
                    name: name.into(),
                    param_type: kind.into(),
                    description: Some(help.into()),
                    required,
                    properties: None,
                    items: None,
                },
            )
        })
        .collect();

    let schema = ToolSchema {
        schema_type: "object".into(),
        properties: Some(params),
        required: Some(vec![
            "project_name".into(),
            "repo_name".into(),
            "path".into(),
        ]),
    };

    let definition = ToolDefinition::new("read_json")
        .description("Parse, validate, and query JSON and JSONC files. Supports JSONPath-like queries ($.key, $.arr[0]), schema inference, and pretty-printing. Automatically detects JSONC (with // comments).")
        .parameters(schema);

    // The handler wraps the tool context for git access and maps the string
    // error of `read_json_exec` into the agent's execution-error variant.
    let handler = ToolHandler::new(|ctx, args| {
        let git_ctx = GitToolCtx::new(ctx);
        Box::pin(async move {
            read_json_exec(git_ctx, args)
                .await
                .map_err(agent::ToolError::ExecutionError)
        })
    });

    registry.register(definition, handler);
}