gitdataai/libs/service/file_tools/csv.rs
ZhenYi 1af796ac75 feat(service): add file_tools module and git_blob_get tool
Add AI-accessible tools for reading structured files (CSV, JSON/JSONC,
Markdown, SQL) and searching repository content (git_grep). Also adds
git_blob_get to retrieve raw blob text content with binary detection.

Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library
API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
2026-04-18 23:02:10 +08:00

326 lines
10 KiB
Rust

//! read_csv — parse and query CSV files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use csv::ReaderBuilder;
use std::collections::HashMap;
/// Execute the `read_csv` tool: load a CSV blob at a git revision and return
/// selected/filtered rows as JSON.
///
/// Arguments read from `args`:
/// - `project_name`, `repo_name`, `path` — required strings
/// - `rev` — git revision, default `"HEAD"`; 40+ chars is treated as a full OID
/// - `delimiter` — single ASCII character, default `,`
/// - `has_header` — default `true`
/// - `columns` — optional list of header names to project
/// - `filter_column` / `filter_value` — exact-match row filter (needs headers)
/// - `offset` / `limit` — row pagination, defaults 0 / 100
///
/// Errors are returned as plain strings (missing args, git lookup failures,
/// binary or oversized files, CSV parse errors).
async fn read_csv_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let delimiter = p
        .get("delimiter")
        .and_then(|v| v.as_str())
        .and_then(|s| s.chars().next())
        .unwrap_or(',');
    // The `csv` crate only supports single-byte delimiters; a bare `as u8`
    // cast would silently truncate a multi-byte char to the wrong byte, so
    // reject non-ASCII delimiters up front.
    if !delimiter.is_ascii() {
        return Err(format!(
            "delimiter must be a single ASCII character, got {:?}",
            delimiter
        ));
    }
    let has_header = p
        .get("has_header")
        .and_then(|v| v.as_bool())
        .unwrap_or(true);
    let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
    let limit = p.get("limit").and_then(|v| v.as_u64()).unwrap_or(100) as usize;
    let filter_col = p.get("filter_column").and_then(|v| v.as_str());
    let filter_val = p.get("filter_value").and_then(|v| v.as_str());
    let select_cols = p.get("columns").and_then(|v| v.as_array()).map(|a| {
        a.iter()
            .filter_map(|v| v.as_str().map(String::from))
            .collect::<Vec<_>>()
    });

    // Resolve revision -> commit -> tree entry -> blob.
    let domain = ctx.open_repo(project_name, repo_name).await?;
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
    if blob.is_binary {
        return Err("file is binary, not a CSV".to_string());
    }
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }

    let text = String::from_utf8_lossy(data);
    let mut reader = ReaderBuilder::new()
        .delimiter(delimiter as u8)
        .has_headers(has_header)
        .from_reader(text.as_bytes());
    let headers: Vec<String> = if has_header {
        reader
            .headers()
            .map_err(|e| e.to_string())?
            .iter()
            .map(String::from)
            .collect()
    } else {
        Vec::new()
    };
    // Column projection: resolve requested names to header positions.
    // Unknown names are silently dropped; only meaningful when headers exist.
    let col_indices: Vec<usize> = if let Some(ref sel) = select_cols {
        sel.iter()
            .filter_map(|col| headers.iter().position(|h| h == col))
            .collect()
    } else {
        (0..headers.len()).collect()
    };
    let filter_col_idx = filter_col.and_then(|c| headers.iter().position(|h| h == c));

    let mut rows: Vec<serde_json::Value> = Vec::new();
    let mut skipped = 0;
    let mut total = 0;
    for result in reader.records() {
        let record = result.map_err(|e| e.to_string())?;
        // Pagination: skip the first `offset` records.
        if skipped < offset {
            skipped += 1;
            continue;
        }
        // NOTE: counts every scanned record past the offset (including
        // filtered-out ones) and the scan stops at `limit`, so the
        // `total_available` reported below is a lower bound, not a
        // whole-file row count.
        total += 1;
        // Exact-match filter on one column (requires headers to resolve).
        if let (Some(fci), Some(fv)) = (filter_col_idx, filter_val) {
            if record.get(fci) != Some(fv) {
                continue;
            }
        }
        let obj = if has_header {
            // With headers: emit an object keyed by the selected column names.
            let mut map = serde_json::Map::new();
            for &idx in &col_indices {
                let key = headers
                    .get(idx)
                    .cloned()
                    .unwrap_or_else(|| format!("col_{}", idx));
                let val = record.get(idx).unwrap_or("").to_string();
                map.insert(key, serde_json::Value::String(val));
            }
            serde_json::Value::Object(map)
        } else {
            // Without headers there are no names to select by, so emit every
            // field. (Previously this indexed through the empty
            // `headers`-derived `col_indices`, producing an empty array for
            // every row.)
            serde_json::Value::Array(
                record
                    .iter()
                    .map(|f| serde_json::Value::String(f.to_string()))
                    .collect(),
            )
        };
        rows.push(obj);
        if rows.len() >= limit {
            break;
        }
    }
    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "headers": if has_header { headers } else { vec![] },
        "selected_columns": select_cols,
        "rows": rows,
        "row_count": rows.len(),
        "total_available": total + offset,
        "filter": if let (Some(c), Some(v)) = (filter_col, filter_val) {
            serde_json::json!({ "column": c, "value": v })
        } else { serde_json::Value::Null },
    }))
}
pub fn register_csv_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
(
"project_name".into(),
ToolParam {
name: "project_name".into(),
param_type: "string".into(),
description: Some("Project name (slug)".into()),
required: true,
properties: None,
items: None,
},
),
(
"repo_name".into(),
ToolParam {
name: "repo_name".into(),
param_type: "string".into(),
description: Some("Repository name".into()),
required: true,
properties: None,
items: None,
},
),
(
"path".into(),
ToolParam {
name: "path".into(),
param_type: "string".into(),
description: Some("File path within the repository".into()),
required: true,
properties: None,
items: None,
},
),
(
"rev".into(),
ToolParam {
name: "rev".into(),
param_type: "string".into(),
description: Some("Git revision (default: HEAD)".into()),
required: false,
properties: None,
items: None,
},
),
(
"delimiter".into(),
ToolParam {
name: "delimiter".into(),
param_type: "string".into(),
description: Some("Field delimiter character (default: comma \",\")".into()),
required: false,
properties: None,
items: None,
},
),
(
"has_header".into(),
ToolParam {
name: "has_header".into(),
param_type: "boolean".into(),
description: Some("If true, first row is column headers (default: true)".into()),
required: false,
properties: None,
items: None,
},
),
(
"columns".into(),
ToolParam {
name: "columns".into(),
param_type: "array".into(),
description: Some("List of column names to select".into()),
required: false,
properties: None,
items: Some(Box::new(ToolParam {
name: "".into(),
param_type: "string".into(),
description: None,
required: false,
properties: None,
items: None,
})),
},
),
(
"filter_column".into(),
ToolParam {
name: "filter_column".into(),
param_type: "string".into(),
description: Some("Column name to filter by".into()),
required: false,
properties: None,
items: None,
},
),
(
"filter_value".into(),
ToolParam {
name: "filter_value".into(),
param_type: "string".into(),
description: Some("Value to match in filter_column".into()),
required: false,
properties: None,
items: None,
},
),
(
"offset".into(),
ToolParam {
name: "offset".into(),
param_type: "integer".into(),
description: Some("Number of rows to skip (default: 0)".into()),
required: false,
properties: None,
items: None,
},
),
(
"limit".into(),
ToolParam {
name: "limit".into(),
param_type: "integer".into(),
description: Some("Maximum rows to return (default: 100)".into()),
required: false,
properties: None,
items: None,
},
),
]);
let schema = ToolSchema {
schema_type: "object".into(),
properties: Some(p),
required: Some(vec![
"project_name".into(),
"repo_name".into(),
"path".into(),
]),
};
registry.register(
ToolDefinition::new("read_csv")
.description("Parse and query a CSV file. Supports header detection, column selection, filtering, pagination (offset/limit), and custom delimiters.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_csv_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}