Add AI-accessible tools for reading structured files (CSV, JSON/JSONC, Markdown, SQL) and searching repository content (git_grep). Also adds git_blob_get to retrieve raw blob text content with binary detection. Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
185 lines
7.9 KiB
Rust
185 lines
7.9 KiB
Rust
//! read_excel — parse and query Excel files (.xlsx, .xls).
|
|
|
|
use crate::file_tools::MAX_FILE_SIZE;
|
|
use crate::git_tools::ctx::GitToolCtx;
|
|
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
|
use calamine::{open_workbook, Reader, Xlsx};
|
|
use futures::FutureExt;
|
|
use std::collections::HashMap;
|
|
|
|
async fn read_excel_exec(
|
|
ctx: GitToolCtx,
|
|
args: serde_json::Value,
|
|
) -> Result<serde_json::Value, String> {
|
|
let p: serde_json::Map<String, serde_json::Value> =
|
|
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
|
|
|
let project_name = p
|
|
.get("project_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing project_name")?;
|
|
let repo_name = p
|
|
.get("repo_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing repo_name")?;
|
|
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
|
let rev = p
|
|
.get("rev")
|
|
.and_then(|v| v.as_str())
|
|
.map(String::from)
|
|
.unwrap_or_else(|| "HEAD".to_string());
|
|
let sheet_name = p.get("sheet_name").and_then(|v| v.as_str()).map(String::from);
|
|
let sheet_index = p.get("sheet_index").and_then(|v| v.as_u64()).map(|v| v as usize);
|
|
let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
|
|
let limit = p
|
|
.get("limit")
|
|
.and_then(|v| v.as_u64())
|
|
.unwrap_or(100) as usize;
|
|
let has_header = p
|
|
.get("has_header")
|
|
.and_then(|v| v.as_bool())
|
|
.unwrap_or(true);
|
|
|
|
let domain = ctx.open_repo(project_name, repo_name).await?;
|
|
|
|
let commit_oid = if rev.len() >= 40 {
|
|
git::commit::types::CommitOid::new(&rev)
|
|
} else {
|
|
domain
|
|
.commit_get_prefix(&rev)
|
|
.map_err(|e| e.to_string())?
|
|
.oid
|
|
};
|
|
|
|
let entry = domain
|
|
.tree_entry_by_path_from_commit(&commit_oid, path)
|
|
.map_err(|e| e.to_string())?;
|
|
let blob = domain.blob_get(&entry.oid).map_err(|e| e.to_string())?;
|
|
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
|
|
|
let data = &content.content;
|
|
if data.len() > MAX_FILE_SIZE {
|
|
return Err(format!(
|
|
"file too large ({} bytes), max {} bytes",
|
|
data.len(),
|
|
MAX_FILE_SIZE
|
|
));
|
|
}
|
|
|
|
// Use cursor-based reading to avoid tempfile
|
|
let cursor = std::io::Cursor::new(data.clone());
|
|
let mut workbook: Xlsx<std::io::Cursor<Vec<u8>>> =
|
|
open_workbook(cursor).map_err(|e| format!("failed to open Excel: {}", e))?;
|
|
|
|
let sheet_names = workbook.sheet_names().to_vec();
|
|
|
|
// Determine which sheet to read
|
|
let sheet_idx = match (sheet_name.clone(), sheet_index) {
|
|
(Some(name), _) => sheet_names
|
|
.iter()
|
|
.position(|n| n == &name)
|
|
.ok_or_else(|| format!("sheet '{}' not found. Available: {:?}", name, sheet_names))?,
|
|
(_, Some(idx)) => {
|
|
if idx >= sheet_names.len() {
|
|
return Err(format!(
|
|
"sheet index {} out of range (0..{})",
|
|
idx,
|
|
sheet_names.len()
|
|
));
|
|
}
|
|
idx
|
|
}
|
|
_ => 0,
|
|
};
|
|
|
|
let range = workbook
|
|
.worksheet_range_at(sheet_idx)
|
|
.map_err(|e| format!("failed to read sheet: {}", e))?;
|
|
|
|
let rows: Vec<Vec<serde_json::Value>> = range
|
|
.rows()
|
|
.skip(if has_header { offset + 1 } else { offset })
|
|
.take(limit)
|
|
.map(|row| {
|
|
row.iter()
|
|
.map(|cell| {
|
|
use calamine::Data;
|
|
match cell {
|
|
Data::Int(i) => serde_json::Value::Number((*i).into()),
|
|
Data::Float(f) => {
|
|
serde_json::json!(f)
|
|
}
|
|
Data::String(s) => serde_json::Value::String(s.clone()),
|
|
Data::Bool(b) => serde_json::Value::Bool(*b),
|
|
Data::DateTime(dt) => {
|
|
serde_json::Value::String(format!("{:?}", dt))
|
|
}
|
|
Data::DateTimeIso(s) => serde_json::Value::String(s.clone()),
|
|
Data::DurationIso(s) => serde_json::Value::String(s.clone()),
|
|
Data::Error(e) => serde_json::json!({ "error": format!("{:?}", e) }),
|
|
Data::Empty => serde_json::Value::Null,
|
|
}
|
|
})
|
|
.collect()
|
|
})
|
|
.collect();
|
|
|
|
let header_row: Vec<String> = if has_header {
|
|
range
|
|
.rows()
|
|
.next()
|
|
.map(|row| {
|
|
row.iter()
|
|
.map(|c| {
|
|
if let calamine::Data::String(s) = c {
|
|
s.clone()
|
|
} else {
|
|
String::new()
|
|
}
|
|
})
|
|
.collect()
|
|
})
|
|
.unwrap_or_default()
|
|
} else {
|
|
vec![]
|
|
};
|
|
|
|
Ok(serde_json::json!({
|
|
"path": path,
|
|
"rev": rev,
|
|
"sheets": sheet_names,
|
|
"active_sheet": sheet_names.get(sheet_idx).cloned(),
|
|
"sheet_index": sheet_idx,
|
|
"headers": header_row,
|
|
"rows": rows,
|
|
"row_count": rows.len(),
|
|
"total_rows": range.rows().count().saturating_sub(if has_header { 1 } else { 0 }),
|
|
}))
|
|
}
|
|
|
|
pub fn register_excel_tools(registry: &mut ToolRegistry) {
|
|
let p = HashMap::from([
|
|
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
|
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
|
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path within the repository (supports .xlsx, .xls)".into()), required: true, properties: None, items: None }),
|
|
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
|
("sheet_name".into(), ToolParam { name: "sheet_name".into(), param_type: "string".into(), description: Some("Sheet name to read. Defaults to first sheet.".into()), required: false, properties: None, items: None }),
|
|
("sheet_index".into(), ToolParam { name: "sheet_index".into(), param_type: "integer".into(), description: Some("Sheet index (0-based). Ignored if sheet_name is set.".into()), required: false, properties: None, items: None }),
|
|
("has_header".into(), ToolParam { name: "has_header".into(), param_type: "boolean".into(), description: Some("If true, first row is column headers (default: true)".into()), required: false, properties: None, items: None }),
|
|
("offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of rows to skip (default: 0)".into()), required: false, properties: None, items: None }),
|
|
("limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum rows to return (default: 100)".into()), required: false, properties: None, items: None }),
|
|
]);
|
|
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
|
registry.register(
|
|
ToolDefinition::new("read_excel")
|
|
.description("Parse and query Excel spreadsheets (.xlsx, .xls). Returns sheet names, headers, and rows with support for sheet selection and pagination.")
|
|
.parameters(schema),
|
|
ToolHandler::new(|ctx, args| {
|
|
let gctx = GitToolCtx::new(ctx);
|
|
Box::pin(async move {
|
|
read_excel_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
|
})
|
|
}),
|
|
);
|
|
}
|