gitdataai/libs/service/file_tools/ppt.rs
ZhenYi 1af796ac75 feat(service): add file_tools module and git_blob_get tool
Add AI-accessible tools for reading structured files (CSV, JSON/JSONC,
Markdown, SQL) and searching repository content (git_grep). Also adds
git_blob_get to retrieve raw blob text content with binary detection.

Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library
API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
2026-04-18 23:02:10 +08:00

205 lines
8.1 KiB
Rust

//! read_ppt — extract text from PowerPoint files (.pptx).
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use futures::FutureExt;
use std::collections::HashMap;
use zip::ZipArchive;
/// Execute the `read_ppt` tool: load a `.pptx` blob from a repository revision
/// and return the text of a range of its slides as a JSON object.
///
/// `args` must be a JSON object with:
/// - `project_name`, `repo_name`, `path` — required strings.
/// - `rev` — optional string; defaults to `"HEAD"`.
/// - `slide_start` — optional 1-based first slide to extract (default 1; 0 is
///   treated as 1). A value past the last slide is clamped to the last slide.
/// - `slide_end` — optional 1-based last slide to extract, inclusive. Defaults
///   to 50 slides starting at `slide_start`. A value before `slide_start`
///   yields an empty result.
/// - `include_notes` — optional bool (default `false`).
///
/// The result contains `path`, `rev`, `total_slides`, `extracted_slides` and a
/// `slides` array of `{ slide, text, char_count, notes }` objects.
///
/// # Errors
/// Returns a `String` message when `args` is not an object, a required
/// argument is missing, the repository / commit / tree entry / blob lookup
/// fails, the blob exceeds `MAX_FILE_SIZE`, the blob is not a readable ZIP
/// archive, or a slide entry cannot be read as UTF-8 text.
async fn read_ppt_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    use std::io::Read;

    let params: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = params
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = params
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = params
        .get("path")
        .and_then(|v| v.as_str())
        .ok_or("missing path")?;
    let rev = params
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    // Saturate instead of truncating if a u64 does not fit in usize.
    let slide_start = params
        .get("slide_start")
        .and_then(|v| v.as_u64())
        .map(|v| v.min(usize::MAX as u64) as usize);
    let slide_end = params
        .get("slide_end")
        .and_then(|v| v.as_u64())
        .map(|v| v.min(usize::MAX as u64) as usize);
    let include_notes = params
        .get("include_notes")
        .and_then(|v| v.as_bool())
        .unwrap_or(false);

    let domain = ctx.open_repo(project_name, repo_name).await?;
    // Revisions of 40+ characters are used directly as an object id; anything
    // shorter is resolved through `commit_get_prefix`.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }

    let cursor = std::io::Cursor::new(data.clone());
    let mut archive =
        ZipArchive::new(cursor).map_err(|e| format!("failed to read PPTX ZIP: {}", e))?;

    // Probe for slide parts by their conventional names, slide1.xml ..= slide1000.xml.
    // NOTE(review): gaps in the numbering shift the reported slide index, and
    // ordering follows file names rather than the presentation's own slide order.
    let slide_files: Vec<String> = (1..=1000)
        .map(|i| format!("ppt/slides/slide{}.xml", i))
        .filter(|name| archive.by_name(name).is_ok())
        .collect();
    let total_slides = slide_files.len();

    // `slide_start` is 1-based in the tool schema; convert it to a 0-based
    // offset and clamp it to the last available slide.
    let start = slide_start
        .unwrap_or(1)
        .saturating_sub(1)
        .min(total_slides.saturating_sub(1));
    // A 1-based inclusive end equals a 0-based exclusive end.
    let end = slide_end
        .unwrap_or_else(|| start.saturating_add(50))
        .min(total_slides);
    // An end before the start selects nothing instead of underflowing.
    let count = end.saturating_sub(start);

    let mut slides: Vec<serde_json::Value> = Vec::with_capacity(count);
    for (offset, slide_file) in slide_files.iter().enumerate().skip(start).take(count) {
        let slide_idx = offset + 1;

        // Read the slide inside its own scope so the entry handle (which
        // borrows the archive mutably) is released before the notes lookup.
        let xml_content = {
            let mut slide_entry = archive
                .by_name(slide_file)
                .map_err(|e| format!("failed to read slide {}: {}", slide_file, e))?;
            let mut buf = String::new();
            slide_entry
                .read_to_string(&mut buf)
                .map_err(|e| e.to_string())?;
            buf
        };
        let text = extract_text_from_pptx_xml(&xml_content);

        // Notes are best-effort: a missing or unreadable notes part yields null.
        // NOTE(review): assumes notesSlideN.xml belongs to slide N — confirm
        // against the slide relationship parts if exact mapping matters.
        let notes = if include_notes {
            let notes_name = format!("ppt/notesSlides/notesSlide{}.xml", slide_idx);
            match archive.by_name(&notes_name) {
                Ok(mut notes_entry) => {
                    let mut notes_xml = String::new();
                    if notes_entry.read_to_string(&mut notes_xml).is_ok() {
                        Some(extract_text_from_pptx_xml(&notes_xml))
                    } else {
                        None
                    }
                }
                Err(_) => None,
            }
        } else {
            None
        };

        let char_count = text.chars().count();
        slides.push(serde_json::json!({
            "slide": slide_idx,
            "text": text,
            "char_count": char_count,
            "notes": notes,
        }));
    }

    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "total_slides": total_slides,
        "extracted_slides": slides.len(),
        "slides": slides,
    }))
}
/// Extract text content from PPTX slide XML using simple tag extraction.
///
/// Collects the contents of every `<a:t>` element, then of every `<w:t>`
/// element (skipping `<w:t>` text that was already collected). Each piece is
/// trimmed, empty pieces are dropped, XML character entities are decoded, and
/// the pieces are joined with single spaces.
///
/// This is a plain string scan, not an XML parser: it returns an empty string
/// for input without matching elements and stops scanning a tag kind at the
/// first unterminated element.
fn extract_text_from_pptx_xml(xml: &str) -> String {
    let mut pieces: Vec<String> = Vec::new();
    collect_tag_text(xml, "<a:t", "</a:t>", false, &mut pieces);
    collect_tag_text(xml, "<w:t", "</w:t>", true, &mut pieces);
    pieces.join(" ")
}

/// Append the trimmed, entity-decoded text of every element opened by `open`
/// (e.g. `"<a:t"`) and closed by `close` to `out`; with `dedupe`, text already
/// present in `out` is skipped.
fn collect_tag_text(xml: &str, open: &str, close: &str, dedupe: bool, out: &mut Vec<String>) {
    let mut pos = 0;
    while let Some(rel) = xml[pos..].find(open) {
        let after_name = pos + rel + open.len();

        // Only accept the exact element name: the character after it must end
        // the name. This rejects longer names such as <a:tbl>, <a:tc>, <a:tab/>.
        let exact_name = match xml[after_name..].chars().next() {
            Some(c) => c == '>' || c == '/' || c.is_ascii_whitespace(),
            None => false,
        };
        if !exact_name {
            pos = after_name;
            continue;
        }

        let content_start = match xml[after_name..].find('>') {
            Some(gt) => after_name + gt + 1,
            None => break,
        };
        // A self-closing element (<a:t/>) has no content and no closing tag.
        if xml[..content_start - 1].ends_with('/') {
            pos = content_start;
            continue;
        }

        let len = match xml[content_start..].find(close) {
            Some(len) => len,
            None => break,
        };
        let raw = xml[content_start..content_start + len].trim();
        if !raw.is_empty() {
            let text = decode_xml_entities(raw);
            if !dedupe || !out.iter().any(|seen| *seen == text) {
                out.push(text);
            }
        }
        // Advance by the real length of the closing tag so the next slice
        // never starts past the end of the input.
        pos = content_start + len + close.len();
    }
}

/// Decode the five predefined XML entities and numeric character references
/// (`&#N;`, `&#xN;`); anything unrecognised is copied through unchanged.
fn decode_xml_entities(text: &str) -> String {
    if !text.contains('&') {
        return text.to_string();
    }
    let mut out = String::with_capacity(text.len());
    let mut rest = text;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];
        let decoded = tail.find(';').and_then(|semi| {
            let name = &tail[1..semi];
            let ch = match name {
                "amp" => Some('&'),
                "lt" => Some('<'),
                "gt" => Some('>'),
                "quot" => Some('"'),
                "apos" => Some('\''),
                _ => name.strip_prefix('#').and_then(|num| {
                    let code = match num.strip_prefix('x') {
                        Some(hex) => u32::from_str_radix(hex, 16).ok(),
                        None => num.parse::<u32>().ok(),
                    };
                    code.and_then(char::from_u32)
                }),
            };
            ch.map(|c| (c, semi + 1))
        });
        match decoded {
            Some((c, consumed)) => {
                out.push(c);
                rest = &tail[consumed..];
            }
            None => {
                // Not a recognised reference: keep the ampersand literally.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }
    out.push_str(rest);
    out
}
/// Register the `read_ppt` tool on `registry`.
///
/// The tool takes `project_name`, `repo_name` and `path` as required string
/// parameters, plus the optional `rev`, `slide_start`, `slide_end` and
/// `include_notes`, and dispatches to `read_ppt_exec`.
pub fn register_ppt_tools(registry: &mut ToolRegistry) {
    // One row per parameter: (name, JSON type, description, required).
    let specs = [
        ("project_name", "string", "Project name (slug)", true),
        ("repo_name", "string", "Repository name", true),
        ("path", "string", "File path to the .pptx document", true),
        ("rev", "string", "Git revision (default: HEAD)", false),
        (
            "slide_start",
            "integer",
            "1-based starting slide number (default: 1)",
            false,
        ),
        ("slide_end", "integer", "1-based ending slide number", false),
        (
            "include_notes",
            "boolean",
            "Include speaker notes (default: false)",
            false,
        ),
    ];

    let mut properties = HashMap::new();
    for (name, kind, description, required) in specs {
        properties.insert(
            name.into(),
            ToolParam {
                name: name.into(),
                param_type: kind.into(),
                description: Some(description.into()),
                required,
                properties: None,
                items: None,
            },
        );
    }

    let schema = ToolSchema {
        schema_type: "object".into(),
        properties: Some(properties),
        required: Some(vec![
            "project_name".into(),
            "repo_name".into(),
            "path".into(),
        ]),
    };

    let definition = ToolDefinition::new("read_ppt")
        .description("Extract text content from PowerPoint presentations (.pptx). Returns slide-by-slide text with character counts. Supports slide range selection and speaker notes.")
        .parameters(schema);

    registry.register(
        definition,
        ToolHandler::new(|ctx, args| {
            // Build the git context eagerly, outside the returned future.
            let git_ctx = GitToolCtx::new(ctx);
            Box::pin(async move {
                read_ppt_exec(git_ctx, args)
                    .await
                    .map_err(agent::ToolError::ExecutionError)
            })
        }),
    );
}