Add AI-accessible tools for reading structured files (CSV, JSON/JSONC, Markdown, SQL) and searching repository content (git_grep). Also adds git_blob_get to retrieve raw blob text content with binary detection. Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
326 lines
10 KiB
Rust
//! read_csv — parse and query CSV files.
|
|
|
|
use crate::file_tools::MAX_FILE_SIZE;
|
|
use crate::git_tools::ctx::GitToolCtx;
|
|
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
|
use csv::ReaderBuilder;
|
|
use std::collections::HashMap;
|
|
|
|
async fn read_csv_exec(
|
|
ctx: GitToolCtx,
|
|
args: serde_json::Value,
|
|
) -> Result<serde_json::Value, String> {
|
|
let p: serde_json::Map<String, serde_json::Value> =
|
|
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
|
|
|
let project_name = p
|
|
.get("project_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing project_name")?;
|
|
let repo_name = p
|
|
.get("repo_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing repo_name")?;
|
|
let path = p
|
|
.get("path")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing path")?;
|
|
let rev = p
|
|
.get("rev")
|
|
.and_then(|v| v.as_str())
|
|
.map(String::from)
|
|
.unwrap_or_else(|| "HEAD".to_string());
|
|
let delimiter = p
|
|
.get("delimiter")
|
|
.and_then(|v| v.as_str())
|
|
.and_then(|s| s.chars().next())
|
|
.unwrap_or(',');
|
|
let has_header = p
|
|
.get("has_header")
|
|
.and_then(|v| v.as_bool())
|
|
.unwrap_or(true);
|
|
let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
|
|
let limit = p.get("limit").and_then(|v| v.as_u64()).unwrap_or(100) as usize;
|
|
let filter_col = p.get("filter_column").and_then(|v| v.as_str());
|
|
let filter_val = p.get("filter_value").and_then(|v| v.as_str());
|
|
let select_cols = p.get("columns").and_then(|v| v.as_array()).map(|a| {
|
|
a.iter()
|
|
.filter_map(|v| v.as_str().map(String::from))
|
|
.collect::<Vec<_>>()
|
|
});
|
|
|
|
let domain = ctx.open_repo(project_name, repo_name).await?;
|
|
|
|
let commit_oid = if rev.len() >= 40 {
|
|
git::commit::types::CommitOid::new(&rev)
|
|
} else {
|
|
domain
|
|
.commit_get_prefix(&rev)
|
|
.map_err(|e| e.to_string())?
|
|
.oid
|
|
};
|
|
|
|
let entry = domain
|
|
.tree_entry_by_path_from_commit(&commit_oid, path)
|
|
.map_err(|e| e.to_string())?;
|
|
let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
|
|
|
|
if blob.is_binary {
|
|
return Err("file is binary, not a CSV".to_string());
|
|
}
|
|
|
|
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
|
let data = &content.content;
|
|
if data.len() > MAX_FILE_SIZE {
|
|
return Err(format!(
|
|
"file too large ({} bytes), max {} bytes",
|
|
data.len(),
|
|
MAX_FILE_SIZE
|
|
));
|
|
}
|
|
|
|
let text = String::from_utf8_lossy(data);
|
|
let mut reader = ReaderBuilder::new()
|
|
.delimiter(delimiter as u8)
|
|
.has_headers(has_header)
|
|
.from_reader(text.as_bytes());
|
|
|
|
let headers: Vec<String> = if has_header {
|
|
reader
|
|
.headers()
|
|
.map_err(|e| e.to_string())?
|
|
.clone()
|
|
.into_iter()
|
|
.map(String::from)
|
|
.collect()
|
|
} else {
|
|
vec![]
|
|
};
|
|
|
|
let col_indices: Vec<usize> = if let Some(ref sel) = select_cols {
|
|
sel.iter()
|
|
.filter_map(|col| headers.iter().position(|h| h == col))
|
|
.collect()
|
|
} else {
|
|
(0..headers.len()).collect()
|
|
};
|
|
|
|
let _col_set: std::collections::HashSet<usize> = col_indices.iter().cloned().collect();
|
|
let filter_col_idx = filter_col.and_then(|c| headers.iter().position(|h| h == c));
|
|
|
|
let mut rows: Vec<serde_json::Value> = Vec::new();
|
|
let mut skipped = 0;
|
|
let mut total = 0;
|
|
|
|
for result in reader.records() {
|
|
let record = result.map_err(|e| e.to_string())?;
|
|
|
|
// Skip offset
|
|
if skipped < offset {
|
|
skipped += 1;
|
|
continue;
|
|
}
|
|
|
|
total += 1;
|
|
|
|
// Filter
|
|
if let (Some(fci), Some(fv)) = (filter_col_idx, filter_val) {
|
|
if record.get(fci) != Some(fv) {
|
|
continue;
|
|
}
|
|
}
|
|
|
|
// Select columns
|
|
let obj = if has_header {
|
|
let mut map = serde_json::Map::new();
|
|
for &idx in &col_indices {
|
|
let key = headers
|
|
.get(idx)
|
|
.cloned()
|
|
.unwrap_or_else(|| format!("col_{}", idx));
|
|
let val = record.get(idx).unwrap_or("").to_string();
|
|
map.insert(key, serde_json::Value::String(val));
|
|
}
|
|
serde_json::Value::Object(map)
|
|
} else {
|
|
let arr: Vec<String> = col_indices
|
|
.iter()
|
|
.map(|&idx| record.get(idx).unwrap_or("").to_string())
|
|
.collect();
|
|
serde_json::Value::Array(arr.into_iter().map(serde_json::Value::String).collect())
|
|
};
|
|
|
|
rows.push(obj);
|
|
|
|
if rows.len() >= limit {
|
|
break;
|
|
}
|
|
}
|
|
|
|
Ok(serde_json::json!({
|
|
"path": path,
|
|
"rev": rev,
|
|
"headers": if has_header { headers } else { vec![] },
|
|
"selected_columns": select_cols,
|
|
"rows": rows,
|
|
"row_count": rows.len(),
|
|
"total_available": total + offset,
|
|
"filter": if let (Some(c), Some(v)) = (filter_col, filter_val) {
|
|
serde_json::json!({ "column": c, "value": v })
|
|
} else { serde_json::Value::Null },
|
|
}))
|
|
}
|
|
|
|
pub fn register_csv_tools(registry: &mut ToolRegistry) {
|
|
let p = HashMap::from([
|
|
(
|
|
"project_name".into(),
|
|
ToolParam {
|
|
name: "project_name".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Project name (slug)".into()),
|
|
required: true,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"repo_name".into(),
|
|
ToolParam {
|
|
name: "repo_name".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Repository name".into()),
|
|
required: true,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"path".into(),
|
|
ToolParam {
|
|
name: "path".into(),
|
|
param_type: "string".into(),
|
|
description: Some("File path within the repository".into()),
|
|
required: true,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"rev".into(),
|
|
ToolParam {
|
|
name: "rev".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Git revision (default: HEAD)".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"delimiter".into(),
|
|
ToolParam {
|
|
name: "delimiter".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Field delimiter character (default: comma \",\")".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"has_header".into(),
|
|
ToolParam {
|
|
name: "has_header".into(),
|
|
param_type: "boolean".into(),
|
|
description: Some("If true, first row is column headers (default: true)".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"columns".into(),
|
|
ToolParam {
|
|
name: "columns".into(),
|
|
param_type: "array".into(),
|
|
description: Some("List of column names to select".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: Some(Box::new(ToolParam {
|
|
name: "".into(),
|
|
param_type: "string".into(),
|
|
description: None,
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
})),
|
|
},
|
|
),
|
|
(
|
|
"filter_column".into(),
|
|
ToolParam {
|
|
name: "filter_column".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Column name to filter by".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"filter_value".into(),
|
|
ToolParam {
|
|
name: "filter_value".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Value to match in filter_column".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"offset".into(),
|
|
ToolParam {
|
|
name: "offset".into(),
|
|
param_type: "integer".into(),
|
|
description: Some("Number of rows to skip (default: 0)".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"limit".into(),
|
|
ToolParam {
|
|
name: "limit".into(),
|
|
param_type: "integer".into(),
|
|
description: Some("Maximum rows to return (default: 100)".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
]);
|
|
let schema = ToolSchema {
|
|
schema_type: "object".into(),
|
|
properties: Some(p),
|
|
required: Some(vec![
|
|
"project_name".into(),
|
|
"repo_name".into(),
|
|
"path".into(),
|
|
]),
|
|
};
|
|
registry.register(
|
|
ToolDefinition::new("read_csv")
|
|
.description("Parse and query a CSV file. Supports header detection, column selection, filtering, pagination (offset/limit), and custom delimiters.")
|
|
.parameters(schema),
|
|
ToolHandler::new(|ctx, args| {
|
|
let gctx = GitToolCtx::new(ctx);
|
|
Box::pin(async move {
|
|
read_csv_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
|
})
|
|
}),
|
|
);
|
|
}
|