Add AI-accessible tools for reading structured files (CSV, JSON/JSONC, Markdown, SQL) and searching repository content (git_grep). Also adds git_blob_get to retrieve raw blob text content with binary detection. Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
205 lines
8.1 KiB
Rust
205 lines
8.1 KiB
Rust
//! read_ppt — extract text from PowerPoint files (.pptx).
|
|
|
|
use crate::file_tools::MAX_FILE_SIZE;
|
|
use crate::git_tools::ctx::GitToolCtx;
|
|
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
|
use futures::FutureExt;
|
|
use std::collections::HashMap;
|
|
use zip::ZipArchive;
|
|
|
|
async fn read_ppt_exec(
|
|
ctx: GitToolCtx,
|
|
args: serde_json::Value,
|
|
) -> Result<serde_json::Value, String> {
|
|
let p: serde_json::Map<String, serde_json::Value> =
|
|
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
|
|
|
let project_name = p
|
|
.get("project_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing project_name")?;
|
|
let repo_name = p
|
|
.get("repo_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing repo_name")?;
|
|
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
|
let rev = p
|
|
.get("rev")
|
|
.and_then(|v| v.as_str())
|
|
.map(String::from)
|
|
.unwrap_or_else(|| "HEAD".to_string());
|
|
let slide_start = p.get("slide_start").and_then(|v| v.as_u64()).map(|v| v as usize);
|
|
let slide_end = p.get("slide_end").and_then(|v| v.as_u64()).map(|v| v as usize);
|
|
let include_notes = p
|
|
.get("include_notes")
|
|
.and_then(|v| v.as_bool())
|
|
.unwrap_or(false);
|
|
|
|
let domain = ctx.open_repo(project_name, repo_name).await?;
|
|
|
|
let commit_oid = if rev.len() >= 40 {
|
|
git::commit::types::CommitOid::new(&rev)
|
|
} else {
|
|
domain
|
|
.commit_get_prefix(&rev)
|
|
.map_err(|e| e.to_string())?
|
|
.oid
|
|
};
|
|
|
|
let entry = domain
|
|
.tree_entry_by_path_from_commit(&commit_oid, path)
|
|
.map_err(|e| e.to_string())?;
|
|
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
|
|
|
let data = &content.content;
|
|
if data.len() > MAX_FILE_SIZE {
|
|
return Err(format!(
|
|
"file too large ({} bytes), max {} bytes",
|
|
data.len(),
|
|
MAX_FILE_SIZE
|
|
));
|
|
}
|
|
|
|
let cursor = std::io::Cursor::new(data.clone());
|
|
let mut archive =
|
|
ZipArchive::new(cursor).map_err(|e| format!("failed to read PPTX ZIP: {}", e))?;
|
|
|
|
let mut slides: Vec<serde_json::Value> = Vec::new();
|
|
|
|
// Collect all slide file names
|
|
let mut slide_files: Vec<String> = (1..=1000)
|
|
.filter_map(|i| {
|
|
let name = format!("ppt/slides/slide{}.xml", i);
|
|
if archive.by_name(&name).is_ok() {
|
|
Some(name)
|
|
} else {
|
|
None
|
|
}
|
|
})
|
|
.collect();
|
|
|
|
let total_slides = slide_files.len();
|
|
let start = slide_start.unwrap_or(0).min(total_slides.saturating_sub(1));
|
|
let end = slide_end.unwrap_or(start + 50).min(total_slides);
|
|
|
|
for slide_file in slide_files.iter().skip(start).take(end - start) {
|
|
let slide_idx = slides.len() + start + 1;
|
|
|
|
let mut file = archive
|
|
.by_name(slide_file)
|
|
.map_err(|e| format!("failed to read slide {}: {}", slide_file, e))?;
|
|
let mut xml_content = String::new();
|
|
use std::io::Read;
|
|
file.read_to_string(&mut xml_content)
|
|
.map_err(|e| e.to_string())?;
|
|
|
|
// Extract text from slide XML
|
|
let text = extract_text_from_pptx_xml(&xml_content);
|
|
|
|
// Optionally extract notes
|
|
let notes = if include_notes {
|
|
let notes_file = format!("ppt/notesSlides/notesSlide{}.xml", slide_idx);
|
|
if let Ok(mut notes_file) = archive.by_name(¬es_file) {
|
|
let mut notes_xml = String::new();
|
|
if notes_file.read_to_string(&mut notes_xml).is_ok() {
|
|
Some(extract_text_from_pptx_xml(¬es_xml))
|
|
} else {
|
|
None
|
|
}
|
|
} else {
|
|
None
|
|
}
|
|
} else {
|
|
None
|
|
};
|
|
|
|
slides.push(serde_json::json!({
|
|
"slide": slide_idx,
|
|
"text": text.clone(),
|
|
"char_count": text.chars().count(),
|
|
"notes": notes,
|
|
}));
|
|
}
|
|
|
|
Ok(serde_json::json!({
|
|
"path": path,
|
|
"rev": rev,
|
|
"total_slides": total_slides,
|
|
"extracted_slides": slides.len(),
|
|
"slides": slides,
|
|
}))
|
|
}
|
|
|
|
/// Extract text content from PPTX slide XML using simple tag extraction.
///
/// Collects the character data of every `<a:t>` element (DrawingML text run),
/// then of every `<w:t>` element whose text has not already been collected.
/// Each run is trimmed, empty runs are dropped, the predefined XML entities
/// and numeric character references are decoded, and the runs are joined
/// with single spaces.
///
/// This is a lightweight scan, not a full XML parser: it never panics on
/// malformed input and simply stops at the first unterminated element.
fn extract_text_from_pptx_xml(xml: &str) -> String {
    let mut results: Vec<String> = Vec::new();

    // PPTX uses <a:t> tags for text content
    for run in element_text_runs(xml, "a:t") {
        let trimmed = run.trim();
        if !trimmed.is_empty() {
            results.push(decode_xml_entities(trimmed));
        }
    }

    // Also try <w:t> tags; skip text that was already collected above.
    for run in element_text_runs(xml, "w:t") {
        let trimmed = run.trim();
        if trimmed.is_empty() {
            continue;
        }
        let decoded = decode_xml_entities(trimmed);
        if !results.contains(&decoded) {
            results.push(decoded);
        }
    }

    results.join(" ")
}

/// Returns the raw inner text of every `<tag ...>...</tag>` element in `xml`.
///
/// Only elements whose name is exactly `tag` are matched, so a search for
/// `a:t` does not pick up `<a:tbl>`, `<a:tc>` or `<a:txBody>`. Self-closing
/// elements (`<a:t/>`) contribute nothing.
fn element_text_runs<'a>(xml: &'a str, tag: &str) -> Vec<&'a str> {
    let open = format!("<{}", tag);
    let close = format!("</{}>", tag);
    let mut runs = Vec::new();
    let mut pos = 0;

    while let Some(found) = xml[pos..].find(&open) {
        let after_name = pos + found + open.len();
        let rest = &xml[after_name..];

        // The element name must end right here; otherwise this is a longer
        // name that merely shares the prefix (e.g. `<a:tbl>`).
        let name_ends_here = match rest.chars().next() {
            Some(c) => c == '>' || c == '/' || c.is_ascii_whitespace(),
            None => false,
        };
        if !name_ends_here {
            pos = after_name;
            continue;
        }

        let tag_end = match rest.find('>') {
            Some(i) => i,
            None => break,
        };
        let content_start = after_name + tag_end + 1;

        // `<a:t/>` has no content and no closing tag.
        if rest[..tag_end].ends_with('/') {
            pos = content_start;
            continue;
        }

        let content_len = match xml[content_start..].find(&close) {
            Some(i) => i,
            None => break,
        };
        runs.push(&xml[content_start..content_start + content_len]);
        // Advance by the real length of the closing tag so the cursor always
        // stays on a valid position inside `xml`.
        pos = content_start + content_len + close.len();
    }

    runs
}

/// Decodes the five predefined XML entities plus decimal / hexadecimal
/// character references. Anything unrecognised is copied through unchanged.
fn decode_xml_entities(text: &str) -> String {
    if !text.contains('&') {
        return text.to_string();
    }

    let mut out = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let tail = &rest[amp..];

        let decoded = tail.find(';').and_then(|semi| {
            let name = &tail[1..semi];
            let ch = match name {
                "amp" => Some('&'),
                "lt" => Some('<'),
                "gt" => Some('>'),
                "quot" => Some('"'),
                "apos" => Some('\''),
                _ => {
                    if let Some(hex) = name.strip_prefix("#x") {
                        u32::from_str_radix(hex, 16).ok().and_then(char::from_u32)
                    } else if let Some(dec) = name.strip_prefix('#') {
                        dec.parse::<u32>().ok().and_then(char::from_u32)
                    } else {
                        None
                    }
                }
            };
            ch.map(|c| (c, semi + 1))
        });

        match decoded {
            Some((c, consumed)) => {
                out.push(c);
                rest = &tail[consumed..];
            }
            None => {
                // Not a reference we understand: keep the ampersand literally.
                out.push('&');
                rest = &tail[1..];
            }
        }
    }

    out.push_str(rest);
    out
}
|
|
|
|
pub fn register_ppt_tools(registry: &mut ToolRegistry) {
|
|
let p = HashMap::from([
|
|
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
|
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
|
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the .pptx document".into()), required: true, properties: None, items: None }),
|
|
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
|
("slide_start".into(), ToolParam { name: "slide_start".into(), param_type: "integer".into(), description: Some("1-based starting slide number (default: 1)".into()), required: false, properties: None, items: None }),
|
|
("slide_end".into(), ToolParam { name: "slide_end".into(), param_type: "integer".into(), description: Some("1-based ending slide number".into()), required: false, properties: None, items: None }),
|
|
("include_notes".into(), ToolParam { name: "include_notes".into(), param_type: "boolean".into(), description: Some("Include speaker notes (default: false)".into()), required: false, properties: None, items: None }),
|
|
]);
|
|
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
|
registry.register(
|
|
ToolDefinition::new("read_ppt")
|
|
.description("Extract text content from PowerPoint presentations (.pptx). Returns slide-by-slide text with character counts. Supports slide range selection and speaker notes.")
|
|
.parameters(schema),
|
|
ToolHandler::new(|ctx, args| {
|
|
let gctx = GitToolCtx::new(ctx);
|
|
Box::pin(async move {
|
|
read_ppt_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
|
})
|
|
}),
|
|
);
|
|
}
|