//! read_word — parse and extract text from Word documents (.docx) via zip+xml. use crate::file_tools::MAX_FILE_SIZE; use crate::git_tools::ctx::GitToolCtx; use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema}; use futures::FutureExt; use quick_xml::events::Event; use quick_xml::Reader; use std::collections::HashMap; use zip::ZipArchive; async fn read_word_exec( ctx: GitToolCtx, args: serde_json::Value, ) -> Result { let p: serde_json::Map = serde_json::from_value(args).map_err(|e| e.to_string())?; let project_name = p .get("project_name") .and_then(|v| v.as_str()) .ok_or("missing project_name")?; let repo_name = p .get("repo_name") .and_then(|v| v.as_str()) .ok_or("missing repo_name")?; let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?; let rev = p .get("rev") .and_then(|v| v.as_str()) .map(String::from) .unwrap_or_else(|| "HEAD".to_string()); let offset = p.get("offset").and_then(|v| v.as_u64()).unwrap_or(0) as usize; let limit = p .get("limit") .and_then(|v| v.as_u64()) .unwrap_or(200) as usize; let sections_only = p .get("sections_only") .and_then(|v| v.as_bool()) .unwrap_or(false); let domain = ctx.open_repo(project_name, repo_name).await?; let commit_oid = if rev.len() >= 40 { git::commit::types::CommitOid::new(&rev) } else { domain .commit_get_prefix(&rev) .map_err(|e| e.to_string())? .oid }; let entry = domain .tree_entry_by_path_from_commit(&commit_oid, path) .map_err(|e| e.to_string())?; let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?; let data = &content.content; if data.len() > MAX_FILE_SIZE { return Err(format!( "file too large ({} bytes), max {} bytes", data.len(), MAX_FILE_SIZE )); } // DOCX is a ZIP archive. Read word/document.xml from it. let cursor = std::io::Cursor::new(data); let mut archive = ZipArchive::new(cursor).map_err(|e| { format!( "failed to open docx as ZIP archive: {}. Make sure the file is a valid .docx document.", e ) })?; let doc_xml = { let file = if let Ok(f) = archive.by_name("word/document.xml") { f } else { archive.by_name("document.xml") .map_err(|_| "docx archive does not contain word/document.xml or document.xml")? }; let mut s = String::new(); let mut reader = std::io::BufReader::new(file); std::io::Read::read_to_string(&mut reader, &mut s) .map_err(|e| format!("failed to read document.xml: {}", e))?; s }; // Parse paragraphs from elements let mut reader = Reader::from_str(&doc_xml); reader.config_mut().trim_text(false); let mut paragraphs: Vec = Vec::new(); let mut buf = Vec::new(); let mut in_paragraph = false; let mut current_text = String::new(); loop { match reader.read_event_into(&mut buf) { Ok(Event::Start(e)) => { if e.name().as_ref() == b"w:p" { in_paragraph = true; current_text.clear(); } } Ok(Event::Text(e)) => { if in_paragraph { let txt = e.unescape().map(|s| s.into_owned()).unwrap_or_default(); current_text.push_str(&txt); } } Ok(Event::End(e)) => { if e.name().as_ref() == b"w:p" && in_paragraph { in_paragraph = false; let text = current_text.trim().to_string(); if !text.is_empty() { paragraphs.push(text); } } } Ok(Event::Eof) => break, _ => {} } buf.clear(); } let total = paragraphs.len(); let body: Vec = if sections_only { paragraphs .iter() .enumerate() .filter(|(_, text)| { text.chars().next().map(|c| c.is_uppercase()).unwrap_or(false) && text.chars().filter(|&c| c == ' ').count() < text.len() / 2 && text.len() < 200 }) .skip(offset) .take(limit) .map(|(i, t)| serde_json::json!({ "index": i, "text": t })) .collect() } else { paragraphs .iter() .skip(offset) .take(limit) .enumerate() .map(|(i, t)| serde_json::json!({ "index": offset + i, "text": t })) .collect() }; Ok(serde_json::json!({ "path": path, "rev": rev, "paragraph_count": total, "paragraphs": body, })) } pub fn register_word_tools(registry: &mut ToolRegistry) { let p = HashMap::from([ ("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }), ("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }), ("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the .docx document".into()), required: true, properties: None, items: None }), ("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }), ("sections_only".into(), ToolParam { name: "sections_only".into(), param_type: "boolean".into(), description: Some("If true, extract only section/heading-like paragraphs (short lines starting with uppercase)".into()), required: false, properties: None, items: None }), ("offset".into(), ToolParam { name: "offset".into(), param_type: "integer".into(), description: Some("Number of paragraphs to skip (default: 0)".into()), required: false, properties: None, items: None }), ("limit".into(), ToolParam { name: "limit".into(), param_type: "integer".into(), description: Some("Maximum paragraphs to return (default: 200)".into()), required: false, properties: None, items: None }), ]); let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) }; registry.register( ToolDefinition::new("read_word") .description("Parse and extract text from Word documents (.docx). Returns paragraphs with index and text content. Supports pagination.") .parameters(schema), ToolHandler::new(|ctx, args| { let gctx = GitToolCtx::new(ctx); Box::pin(async move { read_word_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) }) }), ); }