//! read_csv — parse and query CSV files.

use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use csv::ReaderBuilder;
use std::collections::HashMap;

async fn read_csv_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let delimiter = p
        .get("delimiter")
        .and_then(|v| v.as_str())
        .and_then(|s| s.chars().next())
        .unwrap_or(',');
    let has_header = p
        .get("has_header")
        .and_then(|v| v.as_bool())
        .unwrap_or(true);
    let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
    let limit = p.get("limit").and_then(|v| v.as_u64()).unwrap_or(100) as usize;
    let filter_col = p.get("filter_column").and_then(|v| v.as_str());
    let filter_val = p.get("filter_value").and_then(|v| v.as_str());
    let select_cols = p.get("columns").and_then(|v| v.as_array()).map(|a| {
        a.iter()
            .filter_map(|v| v.as_str().map(String::from))
            .collect::<Vec<String>>()
    });

    let domain = ctx.open_repo(project_name, repo_name).await?;

    // Resolve the revision: a full 40-character hex string is taken as a
    // commit OID directly; anything shorter is resolved as a prefix or ref.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };

    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
    if blob.is_binary {
        return Err("file is binary, not a CSV".to_string());
    }
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }
    let text = String::from_utf8_lossy(data);

    // Note: `delimiter as u8` truncates to one byte, so only ASCII delimiters
    // behave as expected.
    let mut reader = ReaderBuilder::new()
        .delimiter(delimiter as u8)
        .has_headers(has_header)
        .from_reader(text.as_bytes());

    let headers: Vec<String> = if has_header {
        reader
            .headers()
            .map_err(|e| e.to_string())?
            .clone()
            .into_iter()
            .map(String::from)
            .collect()
    } else {
        vec![]
    };

    // Map the requested column names to header indices; with no selection,
    // keep every column.
    let col_indices: Vec<usize> = if let Some(ref sel) = select_cols {
        sel.iter()
            .filter_map(|col| headers.iter().position(|h| h == col))
            .collect()
    } else {
        (0..headers.len()).collect()
    };
    let filter_col_idx = filter_col.and_then(|c| headers.iter().position(|h| h == c));

    let mut rows: Vec<serde_json::Value> = Vec::new();
    let mut skipped = 0;
    let mut total = 0;
    for result in reader.records() {
        let record = result.map_err(|e| e.to_string())?;
        // Skip offset
        if skipped < offset {
            skipped += 1;
            continue;
        }
        total += 1;
        // Filter
        if let (Some(fci), Some(fv)) = (filter_col_idx, filter_val) {
            if record.get(fci) != Some(fv) {
                continue;
            }
        }
        // Select columns
        let obj = if has_header {
            let mut map = serde_json::Map::new();
            for &idx in &col_indices {
                let key = headers
                    .get(idx)
                    .cloned()
                    .unwrap_or_else(|| format!("col_{}", idx));
                let val = record.get(idx).unwrap_or("").to_string();
                map.insert(key, serde_json::Value::String(val));
            }
            serde_json::Value::Object(map)
        } else {
            // Without a header row there are no names to select by, so
            // return every field of the record as a positional array.
            let arr: Vec<String> = record.iter().map(String::from).collect();
            serde_json::Value::Array(arr.into_iter().map(serde_json::Value::String).collect())
        };
        rows.push(obj);
        if rows.len() >= limit {
            break;
        }
    }

    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "headers": headers,
        "selected_columns": select_cols,
        "rows": rows,
        "row_count": rows.len(),
        // Rows scanned so far (offset + records examined), not the full row
        // count of the file; the scan stops once `limit` rows have matched.
        "total_available": total + offset,
        "filter": if let (Some(c), Some(v)) = (filter_col, filter_val) {
            serde_json::json!({ "column": c, "value": v })
        } else {
            serde_json::Value::Null
        },
    }))
}

pub fn register_csv_tools(registry: &mut ToolRegistry) {
    let p = HashMap::from([
        (
            "project_name".into(),
            ToolParam {
                name: "project_name".into(),
                param_type: "string".into(),
                description: Some("Project name (slug)".into()),
                required: true,
                properties: None,
                items: None,
            },
        ),
        (
            "repo_name".into(),
            ToolParam {
                name: "repo_name".into(),
                param_type: "string".into(),
                description: Some("Repository name".into()),
                required: true,
                properties: None,
                items: None,
            },
        ),
        (
            "path".into(),
            ToolParam {
                name: "path".into(),
                param_type: "string".into(),
                description: Some("File path within the repository".into()),
                required: true,
                properties: None,
                items: None,
            },
        ),
        (
            "rev".into(),
            ToolParam {
                name: "rev".into(),
                param_type: "string".into(),
                description: Some("Git revision (default: HEAD)".into()),
                required: false,
                properties: None,
                items: None,
            },
        ),
        (
            "delimiter".into(),
            ToolParam {
                name: "delimiter".into(),
                param_type: "string".into(),
                description: Some("Field delimiter character (default: comma \",\")".into()),
                required: false,
                properties: None,
                items: None,
            },
        ),
        (
            "has_header".into(),
            ToolParam {
                name: "has_header".into(),
                param_type: "boolean".into(),
                description: Some("If true, first row is column headers (default: true)".into()),
                required: false,
                properties: None,
                items: None,
            },
        ),
        (
            "columns".into(),
            ToolParam {
                name: "columns".into(),
                param_type: "array".into(),
                description: Some("List of column names to select".into()),
                required: false,
                properties: None,
                items: Some(Box::new(ToolParam {
                    name: "".into(),
                    param_type: "string".into(),
                    description: None,
                    required: false,
                    properties: None,
                    items: None,
                })),
            },
        ),
        (
            "filter_column".into(),
            ToolParam {
                name: "filter_column".into(),
                param_type: "string".into(),
                description: Some("Column name to filter by".into()),
                required: false,
                properties: None,
                items: None,
            },
        ),
        (
"filter_value".into(), ToolParam { name: "filter_value".into(), param_type: "string".into(), description: Some("Value to match in filter_column".into()), required: false, properties: None, items: None, }, ), ( "offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of rows to skip (default: 0)".into()), required: false, properties: None, items: None, }, ), ( "limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum rows to return (default: 100)".into()), required: false, properties: None, items: None, }, ), ]); let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec![ "project_name".into(), "repo_name".into(), "path".into(), ]), }; registry.register( ToolDefinition::new("read_csv") .description("Parse and query a CSV file. Supports header detection, column selection, filtering, pagination (offset/limit), and custom delimiters.") .parameters(schema), ToolHandler::new(|ctx, args| { let gctx = GitToolCtx::new(ctx); Box::pin(async move { read_csv_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) }) }), ); }