gitdataai/libs/service/file_tools/word.rs

//! read_word — parse and extract text from Word documents (.docx) via zip+xml.

use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use futures::FutureExt;
use quick_xml::events::Event;
use quick_xml::Reader;
use std::collections::HashMap;
use zip::ZipArchive;

/// Returns `true` when `text` looks like a section heading.
///
/// Heuristic: the first character is uppercase, fewer than half of the
/// characters are spaces, and the text is shorter than 200 characters.
/// Lengths are measured in characters (not bytes) so that non-ASCII text is
/// judged by the same thresholds as ASCII text.
fn is_heading_like(text: &str) -> bool {
    let char_count = text.chars().count();
    let space_count = text.chars().filter(|&c| c == ' ').count();
    text.chars().next().map_or(false, char::is_uppercase)
        && space_count < char_count / 2
        && char_count < 200
}

/// Extracts the non-empty paragraphs from the contents of a `document.xml`.
///
/// A paragraph is a `<w:p>` element. Only text inside `<w:t>` (run text) and
/// `<m:t>` (math text) elements is collected, so text nodes belonging to other
/// elements (field instructions, deleted text, drawing offsets, ...) do not
/// leak into the result. Each paragraph is trimmed; empty ones are dropped.
///
/// Paragraphs may nest (e.g. a text box anchored inside a paragraph). Open
/// paragraphs are kept on a stack, so a nested paragraph is emitted when it
/// closes — i.e. before the paragraph that contains it — and the outer
/// paragraph keeps its own text.
///
/// # Errors
///
/// Returns a message if the XML reader reports a parse error.
fn docx_paragraphs(xml: &str) -> Result<Vec<String>, String> {
    let mut reader = Reader::from_str(xml);
    // Whitespace inside runs is significant, so do not let the reader trim it.
    reader.config_mut().trim_text(false);

    let mut paragraphs: Vec<String> = Vec::new();
    // One text buffer per currently open <w:p>; the innermost is last.
    let mut open: Vec<String> = Vec::new();
    let mut in_text = false;
    let mut buf = Vec::new();

    loop {
        match reader.read_event_into(&mut buf) {
            Ok(Event::Start(e)) => {
                let name = e.name();
                if name.as_ref() == b"w:p" {
                    open.push(String::new());
                } else if name.as_ref() == b"w:t" || name.as_ref() == b"m:t" {
                    in_text = true;
                }
            }
            Ok(Event::Text(e)) => {
                if in_text {
                    if let Some(current) = open.last_mut() {
                        // Best effort: text that fails to unescape is skipped.
                        let txt = e.unescape().map(|s| s.into_owned()).unwrap_or_default();
                        current.push_str(&txt);
                    }
                }
            }
            Ok(Event::End(e)) => {
                let name = e.name();
                if name.as_ref() == b"w:t" || name.as_ref() == b"m:t" {
                    in_text = false;
                } else if name.as_ref() == b"w:p" {
                    if let Some(raw) = open.pop() {
                        let text = raw.trim();
                        if !text.is_empty() {
                            paragraphs.push(text.to_string());
                        }
                    }
                }
            }
            Ok(Event::Eof) => break,
            Ok(_) => {}
            // Surface malformed XML instead of silently returning partial text.
            Err(e) => return Err(format!("failed to parse document.xml: {}", e)),
        }
        buf.clear();
    }

    Ok(paragraphs)
}

/// Executes the `read_word` tool: reads a `.docx` blob from a repository at a
/// given revision and returns its paragraphs as JSON.
///
/// Arguments (fields of the `args` JSON object):
/// - `project_name`, `repo_name`, `path` — required strings.
/// - `rev` — optional revision string, defaults to `"HEAD"`. Values of 40 or
///   more characters are used directly as a commit id; shorter values are
///   resolved through `commit_get_prefix`.
/// - `offset` — optional number of entries to skip (default 0).
/// - `limit` — optional maximum number of entries to return (default 200).
/// - `sections_only` — optional bool (default false); when true only
///   heading-like paragraphs are returned, and `offset`/`limit` apply to that
///   filtered list.
///
/// Returns an object with `path`, `rev`, `paragraph_count` (number of
/// non-empty paragraphs in the whole document) and `paragraphs`, a list of
/// `{ "index", "text" }` where `index` is the paragraph's position among all
/// non-empty paragraphs.
///
/// # Errors
///
/// Returns a message when a required argument is missing, the repository,
/// revision or path cannot be resolved, the blob exceeds `MAX_FILE_SIZE`, the
/// blob is not a ZIP archive containing `word/document.xml` (or
/// `document.xml`), or the document XML cannot be read or parsed.
async fn read_word_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;

    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    // Saturate instead of truncating if a value does not fit in `usize`.
    let offset = p
        .get("offset")
        .and_then(|v| v.as_u64())
        .map_or(0, |v| usize::try_from(v).unwrap_or(usize::MAX));
    let limit = p
        .get("limit")
        .and_then(|v| v.as_u64())
        .map_or(200, |v| usize::try_from(v).unwrap_or(usize::MAX));
    let sections_only = p
        .get("sections_only")
        .and_then(|v| v.as_bool())
        .unwrap_or(false);

    let domain = ctx.open_repo(project_name, repo_name).await?;

    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };

    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;

    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }

    // DOCX is a ZIP archive. Read word/document.xml from it.
    let cursor = std::io::Cursor::new(data);
    let mut archive = ZipArchive::new(cursor).map_err(|e| {
        format!(
            "failed to open docx as ZIP archive: {}. Make sure the file is a valid .docx document.",
            e
        )
    })?;

    let doc_xml = {
        let file = if let Ok(f) = archive.by_name("word/document.xml") {
            f
        } else {
            archive.by_name("document.xml")
                .map_err(|_| "docx archive does not contain word/document.xml or document.xml")?
        };
        let mut s = String::new();
        let mut reader = std::io::BufReader::new(file);
        // NOTE(review): the size check above bounds the compressed blob only;
        // the decompressed XML is read without a limit. Consider capping it
        // (e.g. `Read::take`) once an appropriate ceiling is agreed on.
        std::io::Read::read_to_string(&mut reader, &mut s)
            .map_err(|e| format!("failed to read document.xml: {}", e))?;
        s
    };

    let paragraphs = docx_paragraphs(&doc_xml)?;
    let total = paragraphs.len();

    // Enumerate before filtering so `index` always refers to the position in
    // the full paragraph list, then paginate over whatever survives the filter.
    let body: Vec<serde_json::Value> = paragraphs
        .iter()
        .enumerate()
        .filter(|(_, text)| !sections_only || is_heading_like(text))
        .skip(offset)
        .take(limit)
        .map(|(i, t)| serde_json::json!({ "index": i, "text": t }))
        .collect();

    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "paragraph_count": total,
        "paragraphs": body,
    }))
}

/// Registers the `read_word` tool on `registry`.
///
/// The tool's schema declares three required string parameters
/// (`project_name`, `repo_name`, `path`) and four optional ones (`rev`,
/// `sections_only`, `offset`, `limit`). Its handler wraps the incoming context
/// in a [`GitToolCtx`] and delegates to `read_word_exec`, mapping a failure
/// message into `agent::ToolError::ExecutionError`.
pub fn register_word_tools(registry: &mut ToolRegistry) {
    // Builds one `(key, ToolParam)` entry; every parameter here is a scalar,
    // so `properties` and `items` are always `None`.
    let param = |name: &str, kind: &str, description: &str, required: bool| {
        (
            name.into(),
            ToolParam {
                name: name.into(),
                param_type: kind.into(),
                description: Some(description.into()),
                required,
                properties: None,
                items: None,
            },
        )
    };

    let properties = HashMap::from([
        param("project_name", "string", "Project name (slug)", true),
        param("repo_name", "string", "Repository name", true),
        param("path", "string", "File path to the .docx document", true),
        param("rev", "string", "Git revision (default: HEAD)", false),
        param(
            "sections_only",
            "boolean",
            "If true, extract only section/heading-like paragraphs (short lines starting with uppercase)",
            false,
        ),
        param("offset", "integer", "Number of paragraphs to skip (default: 0)", false),
        param("limit", "integer", "Maximum paragraphs to return (default: 200)", false),
    ]);

    let schema = ToolSchema {
        schema_type: "object".into(),
        properties: Some(properties),
        required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]),
    };

    let definition = ToolDefinition::new("read_word")
        .description("Parse and extract text from Word documents (.docx). Returns paragraphs with index and text content. Supports pagination.")
        .parameters(schema);

    let handler = ToolHandler::new(|ctx, args| {
        let git_ctx = GitToolCtx::new(ctx);
        Box::pin(async move {
            let outcome = read_word_exec(git_ctx, args).await;
            outcome.map_err(agent::ToolError::ExecutionError)
        })
    });

    registry.register(definition, handler);
}