//! read_pdf — extract text from PDF files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use futures::FutureExt;
use lopdf::{Document, Object, ObjectId};
use std::collections::HashMap;

/// Extract text content from a PDF page's content stream.
///
/// Best-effort: returns an empty string when the page object, its
/// `Contents` entry, or any referenced stream cannot be resolved, rather
/// than propagating an error.
fn extract_page_text(doc: &Document, page_id: ObjectId) -> String {
    let mut text = String::new();

    // Resolve the page object and view it as a dictionary; bail out
    // quietly on a dangling reference or a non-dictionary object.
    let page_dict = match doc.get_object(page_id).and_then(Object::as_dict) {
        Ok(dict) => dict,
        Err(_) => return text,
    };

    // `Contents` may be a single stream reference or an array of them.
    let content_streams = match page_dict.get(b"Contents") {
        Ok(obj) => obj.clone(),
        Err(_) => return text,
    };

    let stream_ids: Vec<ObjectId> = match &content_streams {
        Object::Reference(id) => vec![*id],
        Object::Array(arr) => arr
            .iter()
            .filter_map(|o| {
                if let Object::Reference(id) = o {
                    Some(*id)
                } else {
                    None
                }
            })
            .collect(),
        // Anything else (null, inline dict, ...) carries no extractable text.
        _ => return text,
    };

    for stream_id in stream_ids {
        if let Ok(Object::Stream(stream)) = doc.get_object(stream_id) {
            // `decompressed_content` already applies the stream's filters
            // (FlateDecode etc.), so the bytes are plain content-stream
            // data — re-inflating them here would risk corrupting content
            // that merely starts with a zlib-looking byte.
            if let Ok(decompressed) = stream.decompressed_content() {
                text.push_str(&extract_text_from_content(&decompressed));
                text.push('\n');
            }
        }
    }
    text
}

/// Very simple PDF content stream text extraction.
/// Handles Tj, TJ, Td, T*, ', " operators.
fn extract_text_from_content(content: &[u8]) -> String { let data = String::from_utf8_lossy(content); let mut result = String::new(); let mut in_parens = false; let mut current_text = String::new(); let mut last_was_tj = false; let mut chars = data.chars().peekable(); while let Some(c) = chars.next() { match c { '(' => { in_parens = true; current_text.clear(); } ')' if in_parens => { in_parens = false; if !current_text.is_empty() { if last_was_tj { // TJ operator: subtract current text width offset } result.push_str(¤t_text); result.push(' '); last_was_tj = false; } } c if in_parens => { if c == '\\' { if let Some(escaped) = chars.next() { match escaped { 'n' => current_text.push('\n'), 'r' => current_text.push('\r'), 't' => current_text.push('\t'), _ => current_text.push(escaped), } } } else { current_text.push(c); } } '%' => { // Comment, skip to end of line while let Some(nc) = chars.next() { if nc == '\n' || nc == '\r' { break; } } } _ => {} } } // Clean up excessive newlines let lines: Vec<&str> = result.lines().map(|l| l.trim()).filter(|l| !l.is_empty()).collect(); lines.join("\n") } fn decompress_pdf_stream(data: &[u8]) -> Vec { // Try to detect and decompress flate/zlib streams if data.len() < 2 { return data.to_vec(); } // Simple zlib check: zlib-wrapped deflate starts with 0x78 if data.starts_with(&[0x78]) || data.starts_with(&[0x08, 0x1b]) { if let Ok(decoded) = flate2::read::ZlibDecoder::new(data).bytes().collect::, _>>() { return decoded; } } // Try raw deflate if let Ok(decoded) = flate2::read::DeflateDecoder::new(data).bytes().collect::, _>>() { return decoded; } data.to_vec() } async fn read_pdf_exec( ctx: GitToolCtx, args: serde_json::Value, ) -> Result { let p: serde_json::Map = serde_json::from_value(args).map_err(|e| e.to_string())?; let project_name = p .get("project_name") .and_then(|v| v.as_str()) .ok_or("missing project_name")?; let repo_name = p .get("repo_name") .and_then(|v| v.as_str()) .ok_or("missing repo_name")?; let path = 
p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?; let rev = p .get("rev") .and_then(|v| v.as_str()) .map(String::from) .unwrap_or_else(|| "HEAD".to_string()); let page_start = p.get("page_start").and_then(|v| v.as_u64()).map(|v| v as usize); let page_end = p.get("page_end").and_then(|v| v.as_u64()).map(|v| v as usize); let max_pages = p .get("max_pages") .and_then(|v| v.as_u64()) .unwrap_or(20) as usize; let domain = ctx.open_repo(project_name, repo_name).await?; let commit_oid = if rev.len() >= 40 { git::commit::types::CommitOid::new(&rev) } else { domain .commit_get_prefix(&rev) .map_err(|e| e.to_string())? .oid }; let entry = domain .tree_entry_by_path_from_commit(&commit_oid, path) .map_err(|e| e.to_string())?; let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?; let data = &content.content; if data.len() > MAX_FILE_SIZE { return Err(format!( "file too large ({} bytes), max {} bytes", data.len(), MAX_FILE_SIZE )); } let doc = Document::load_from_mem(data) .map_err(|e| format!("failed to parse PDF: {}", e))?; // Get all page references let pages: Vec = doc .pages .values() .cloned() .collect(); let total_pages = pages.len(); let start = page_start.unwrap_or(0).min(total_pages.saturating_sub(1)); let end = page_end.unwrap_or(start + max_pages).min(total_pages); let mut page_texts: Vec = Vec::new(); for (i, page_id) in pages.iter().enumerate().skip(start).take(end - start) { let text = extract_page_text(&doc, *page_id); page_texts.push(serde_json::json!({ "page": i + 1, "text": text, "char_count": text.chars().count(), })); } Ok(serde_json::json!({ "path": path, "rev": rev, "total_pages": total_pages, "extracted_pages": page_texts.len(), "pages": page_texts, })) } pub fn register_pdf_tools(registry: &mut ToolRegistry) { let p = HashMap::from([ ("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None 
}), ("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }), ("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the PDF document".into()), required: true, properties: None, items: None }), ("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }), ("page_start".into(), ToolParam { name: "page_start".into(), param_type: "integer".into(), description: Some("1-based starting page number (default: 1)".into()), required: false, properties: None, items: None }), ("page_end".into(), ToolParam { name: "page_end".into(), param_type: "integer".into(), description: Some("1-based ending page number (default: page_start + 20)".into()), required: false, properties: None, items: None }), ("max_pages".into(), ToolParam { name: "max_pages".into(), param_type: "integer".into(), description: Some("Maximum number of pages to extract (default: 20)".into()), required: false, properties: None, items: None }), ]); let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) }; registry.register( ToolDefinition::new("read_pdf") .description("Extract text content from PDF files. Returns page-by-page text extraction with character counts. Supports page range selection.") .parameters(schema), ToolHandler::new(|ctx, args| { let gctx = GitToolCtx::new(ctx); Box::pin(async move { read_pdf_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError) }) }), ); }