Add AI-accessible tools for reading structured files (CSV, JSON/JSONC, Markdown, SQL) and searching repository content (git_grep). Also adds git_blob_get to retrieve raw blob text content with binary detection. Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
185 lines
7.2 KiB
Rust
//! read_word — parse and extract text from Word documents (.docx) via zip+xml.
|
|
|
|
use crate::file_tools::MAX_FILE_SIZE;
|
|
use crate::git_tools::ctx::GitToolCtx;
|
|
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
|
use futures::FutureExt;
|
|
use quick_xml::events::Event;
|
|
use quick_xml::Reader;
|
|
use std::collections::HashMap;
|
|
use zip::ZipArchive;
|
|
|
|
async fn read_word_exec(
|
|
ctx: GitToolCtx,
|
|
args: serde_json::Value,
|
|
) -> Result<serde_json::Value, String> {
|
|
let p: serde_json::Map<String, serde_json::Value> =
|
|
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
|
|
|
let project_name = p
|
|
.get("project_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing project_name")?;
|
|
let repo_name = p
|
|
.get("repo_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing repo_name")?;
|
|
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
|
let rev = p
|
|
.get("rev")
|
|
.and_then(|v| v.as_str())
|
|
.map(String::from)
|
|
.unwrap_or_else(|| "HEAD".to_string());
|
|
let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize;
|
|
let limit = p
|
|
.get("limit")
|
|
.and_then(|v| v.as_u64())
|
|
.unwrap_or(200) as usize;
|
|
let sections_only = p
|
|
.get("sections_only")
|
|
.and_then(|v| v.as_bool())
|
|
.unwrap_or(false);
|
|
|
|
let domain = ctx.open_repo(project_name, repo_name).await?;
|
|
|
|
let commit_oid = if rev.len() >= 40 {
|
|
git::commit::types::CommitOid::new(&rev)
|
|
} else {
|
|
domain
|
|
.commit_get_prefix(&rev)
|
|
.map_err(|e| e.to_string())?
|
|
.oid
|
|
};
|
|
|
|
let entry = domain
|
|
.tree_entry_by_path_from_commit(&commit_oid, path)
|
|
.map_err(|e| e.to_string())?;
|
|
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
|
|
|
let data = &content.content;
|
|
if data.len() > MAX_FILE_SIZE {
|
|
return Err(format!(
|
|
"file too large ({} bytes), max {} bytes",
|
|
data.len(),
|
|
MAX_FILE_SIZE
|
|
));
|
|
}
|
|
|
|
// DOCX is a ZIP archive. Read word/document.xml from it.
|
|
let cursor = std::io::Cursor::new(data);
|
|
let mut archive = ZipArchive::new(cursor).map_err(|e| {
|
|
format!(
|
|
"failed to open docx as ZIP archive: {}. Make sure the file is a valid .docx document.",
|
|
e
|
|
)
|
|
})?;
|
|
|
|
let doc_xml = {
|
|
let file = if let Ok(f) = archive.by_name("word/document.xml") {
|
|
f
|
|
} else {
|
|
archive.by_name("document.xml")
|
|
.map_err(|_| "docx archive does not contain word/document.xml or document.xml")?
|
|
};
|
|
let mut s = String::new();
|
|
let mut reader = std::io::BufReader::new(file);
|
|
std::io::Read::read_to_string(&mut reader, &mut s)
|
|
.map_err(|e| format!("failed to read document.xml: {}", e))?;
|
|
s
|
|
};
|
|
|
|
// Parse paragraphs from <w:p> elements
|
|
let mut reader = Reader::from_str(&doc_xml);
|
|
reader.config_mut().trim_text(false);
|
|
|
|
let mut paragraphs: Vec<String> = Vec::new();
|
|
let mut buf = Vec::new();
|
|
let mut in_paragraph = false;
|
|
let mut current_text = String::new();
|
|
|
|
loop {
|
|
match reader.read_event_into(&mut buf) {
|
|
Ok(Event::Start(e)) => {
|
|
if e.name().as_ref() == b"w:p" {
|
|
in_paragraph = true;
|
|
current_text.clear();
|
|
}
|
|
}
|
|
Ok(Event::Text(e)) => {
|
|
if in_paragraph {
|
|
let txt = e.unescape().map(|s| s.into_owned()).unwrap_or_default();
|
|
current_text.push_str(&txt);
|
|
}
|
|
}
|
|
Ok(Event::End(e)) => {
|
|
if e.name().as_ref() == b"w:p" && in_paragraph {
|
|
in_paragraph = false;
|
|
let text = current_text.trim().to_string();
|
|
if !text.is_empty() {
|
|
paragraphs.push(text);
|
|
}
|
|
}
|
|
}
|
|
Ok(Event::Eof) => break,
|
|
_ => {}
|
|
}
|
|
buf.clear();
|
|
}
|
|
|
|
let total = paragraphs.len();
|
|
|
|
let body: Vec<serde_json::Value> = if sections_only {
|
|
paragraphs
|
|
.iter()
|
|
.enumerate()
|
|
.filter(|(_, text)| {
|
|
text.chars().next().map(|c| c.is_uppercase()).unwrap_or(false)
|
|
&& text.chars().filter(|&c| c == ' ').count() < text.len() / 2
|
|
&& text.len() < 200
|
|
})
|
|
.skip(offset)
|
|
.take(limit)
|
|
.map(|(i, t)| serde_json::json!({ "index": i, "text": t }))
|
|
.collect()
|
|
} else {
|
|
paragraphs
|
|
.iter()
|
|
.skip(offset)
|
|
.take(limit)
|
|
.enumerate()
|
|
.map(|(i, t)| serde_json::json!({ "index": offset + i, "text": t }))
|
|
.collect()
|
|
};
|
|
|
|
Ok(serde_json::json!({
|
|
"path": path,
|
|
"rev": rev,
|
|
"paragraph_count": total,
|
|
"paragraphs": body,
|
|
}))
|
|
}
|
|
|
|
pub fn register_word_tools(registry: &mut ToolRegistry) {
|
|
let p = HashMap::from([
|
|
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
|
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
|
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the .docx document".into()), required: true, properties: None, items: None }),
|
|
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
|
("sections_only".into(), ToolParam { name: "sections_only".into(), param_type: "boolean".into(), description: Some("If true, extract only section/heading-like paragraphs (short lines starting with uppercase)".into()), required: false, properties: None, items: None }),
|
|
("offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of paragraphs to skip (default: 0)".into()), required: false, properties: None, items: None }),
|
|
("limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum paragraphs to return (default: 200)".into()), required: false, properties: None, items: None }),
|
|
]);
|
|
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
|
registry.register(
|
|
ToolDefinition::new("read_word")
|
|
.description("Parse and extract text from Word documents (.docx). Returns paragraphs with index and text content. Supports pagination.")
|
|
.parameters(schema),
|
|
ToolHandler::new(|ctx, args| {
|
|
let gctx = GitToolCtx::new(ctx);
|
|
Box::pin(async move {
|
|
read_word_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
|
})
|
|
}),
|
|
);
|
|
}
|