Add AI-accessible tools for reading structured files (CSV, JSON/JSONC, Markdown, SQL) and searching repository content (git_grep). Also adds git_blob_get to retrieve raw blob text content with binary detection. Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
245 lines · 8.9 KiB · Rust
//! read_pdf — extract text from PDF files.
|
|
|
|
use crate::file_tools::MAX_FILE_SIZE;
|
|
use crate::git_tools::ctx::GitToolCtx;
|
|
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
|
use futures::FutureExt;
|
|
use lopdf::{Document, Object, ObjectId};
|
|
use std::collections::HashMap;
|
|
|
|
/// Extract text content from a PDF page's content stream.
|
|
fn extract_page_text(doc: &Document, page_id: ObjectId) -> String {
    // Accumulates the text of every content stream on this page; all failure
    // paths return whatever was collected so far (possibly ""), so a
    // malformed page degrades to empty text instead of failing the document.
    let mut text = String::new();

    // Get page dictionary.
    // NOTE(review): lopdf's documented accessor is `Document::get_object`;
    // `doc.get(page_id)` may not exist in lopdf 0.34 — confirm this compiles
    // against the pinned version (the commit notes API incompatibilities).
    let page_dict = match doc.get(page_id) {
        Ok(dict) => dict,
        Err(_) => return text,
    };

    // Get content streams: per the PDF spec, /Contents can be a single
    // stream reference or an array of references.
    // NOTE(review): `page_dict` is an `Object` here, and `.get(b"Contents")`
    // presumes an Object-level dictionary lookup — verify this method exists
    // (lopdf normally requires `.as_dict()` first).
    let content_streams = match page_dict.get(b"Contents") {
        Ok(obj) => obj.clone(),
        Err(_) => return text,
    };

    // Normalize /Contents into a flat list of stream object ids. Non-reference
    // array entries (inline objects) are silently skipped; any other object
    // type aborts with the text collected so far.
    let stream_ids: Vec<ObjectId> = match &content_streams {
        Object::Reference(id) => vec![*id],
        Object::Array(arr) => arr
            .iter()
            .filter_map(|o| {
                if let Object::Reference(id) = o {
                    Some(*id)
                } else {
                    None
                }
            })
            .collect(),
        _ => return text,
    };

    for stream_id in stream_ids {
        if let Ok((_, stream)) = doc.get_stream(stream_id) {
            // Decode the stream. `decompressed_content()` already applies the
            // stream's declared filters; `decompress_pdf_stream` is a second,
            // best-effort inflate pass that returns its input unchanged when
            // the bytes do not look flate-compressed.
            if let Ok(decompressed) = stream.decompressed_content() {
                text.push_str(&extract_text_from_content(&decompress_pdf_stream(&decompressed)));
                text.push('\n');
            }
        }
    }

    text
}
|
|
|
|
/// Very simple PDF content stream text extraction.
|
|
/// Handles Tj, TJ, Td, T*, ', " operators.
|
|
/// Very simple PDF content stream text extraction.
///
/// Lossily decodes the stream as UTF-8 and collects every parenthesized
/// string literal, honoring the `\n`/`\r`/`\t` backslash escapes (any other
/// escaped character is kept verbatim, so `\(` and `\)` work). `%` comments
/// outside string literals are skipped to end of line. Collected strings are
/// joined with spaces, then blank lines are stripped.
///
/// Limitations: operators (Tj, TJ, Td, T*, ', ") are not interpreted, hex
/// strings (`<...>`), octal escapes, and nested unescaped parentheses are
/// not handled, so text ordering/positioning nuances are lost.
fn extract_text_from_content(content: &[u8]) -> String {
    let data = String::from_utf8_lossy(content);
    let mut result = String::new();
    let mut in_parens = false;
    let mut current_text = String::new();

    let mut chars = data.chars();

    while let Some(c) = chars.next() {
        match c {
            '(' => {
                // String literal opens; discard any stale buffer contents.
                in_parens = true;
                current_text.clear();
            }
            ')' if in_parens => {
                in_parens = false;
                if !current_text.is_empty() {
                    // Was `result.push_str(¤t_text)` — mojibake from the
                    // HTML entity `&curren;` eating the start of `&current_text`.
                    result.push_str(&current_text);
                    result.push(' ');
                }
            }
            c if in_parens => {
                if c == '\\' {
                    // Backslash escape: translate the common whitespace
                    // escapes, pass every other escaped char through as-is.
                    if let Some(escaped) = chars.next() {
                        match escaped {
                            'n' => current_text.push('\n'),
                            'r' => current_text.push('\r'),
                            't' => current_text.push('\t'),
                            _ => current_text.push(escaped),
                        }
                    }
                } else {
                    current_text.push(c);
                }
            }
            '%' => {
                // Comment outside a string literal: skip to end of line.
                for nc in chars.by_ref() {
                    if nc == '\n' || nc == '\r' {
                        break;
                    }
                }
            }
            _ => {}
        }
    }

    // Clean up excessive newlines and surrounding whitespace.
    let lines: Vec<&str> = result.lines().map(|l| l.trim()).filter(|l| !l.is_empty()).collect();
    lines.join("\n")
}
|
|
|
|
fn decompress_pdf_stream(data: &[u8]) -> Vec<u8> {
|
|
// Try to detect and decompress flate/zlib streams
|
|
if data.len() < 2 {
|
|
return data.to_vec();
|
|
}
|
|
|
|
// Simple zlib check: zlib-wrapped deflate starts with 0x78
|
|
if data.starts_with(&[0x78]) || data.starts_with(&[0x08, 0x1b]) {
|
|
if let Ok(decoded) = flate2::read::ZlibDecoder::new(data).bytes().collect::<Result<Vec<_>, _>>() {
|
|
return decoded;
|
|
}
|
|
}
|
|
|
|
// Try raw deflate
|
|
if let Ok(decoded) = flate2::read::DeflateDecoder::new(data).bytes().collect::<Result<Vec<_>, _>>() {
|
|
return decoded;
|
|
}
|
|
|
|
data.to_vec()
|
|
}
|
|
|
|
async fn read_pdf_exec(
|
|
ctx: GitToolCtx,
|
|
args: serde_json::Value,
|
|
) -> Result<serde_json::Value, String> {
|
|
let p: serde_json::Map<String, serde_json::Value> =
|
|
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
|
|
|
let project_name = p
|
|
.get("project_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing project_name")?;
|
|
let repo_name = p
|
|
.get("repo_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing repo_name")?;
|
|
let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
|
|
let rev = p
|
|
.get("rev")
|
|
.and_then(|v| v.as_str())
|
|
.map(String::from)
|
|
.unwrap_or_else(|| "HEAD".to_string());
|
|
let page_start = p.get("page_start").and_then(|v| v.as_u64()).map(|v| v as usize);
|
|
let page_end = p.get("page_end").and_then(|v| v.as_u64()).map(|v| v as usize);
|
|
let max_pages = p
|
|
.get("max_pages")
|
|
.and_then(|v| v.as_u64())
|
|
.unwrap_or(20) as usize;
|
|
|
|
let domain = ctx.open_repo(project_name, repo_name).await?;
|
|
|
|
let commit_oid = if rev.len() >= 40 {
|
|
git::commit::types::CommitOid::new(&rev)
|
|
} else {
|
|
domain
|
|
.commit_get_prefix(&rev)
|
|
.map_err(|e| e.to_string())?
|
|
.oid
|
|
};
|
|
|
|
let entry = domain
|
|
.tree_entry_by_path_from_commit(&commit_oid, path)
|
|
.map_err(|e| e.to_string())?;
|
|
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
|
|
|
let data = &content.content;
|
|
if data.len() > MAX_FILE_SIZE {
|
|
return Err(format!(
|
|
"file too large ({} bytes), max {} bytes",
|
|
data.len(),
|
|
MAX_FILE_SIZE
|
|
));
|
|
}
|
|
|
|
let doc = Document::load_from_mem(data)
|
|
.map_err(|e| format!("failed to parse PDF: {}", e))?;
|
|
|
|
// Get all page references
|
|
let pages: Vec<ObjectId> = doc
|
|
.pages
|
|
.values()
|
|
.cloned()
|
|
.collect();
|
|
|
|
let total_pages = pages.len();
|
|
|
|
let start = page_start.unwrap_or(0).min(total_pages.saturating_sub(1));
|
|
let end = page_end.unwrap_or(start + max_pages).min(total_pages);
|
|
|
|
let mut page_texts: Vec<serde_json::Value> = Vec::new();
|
|
|
|
for (i, page_id) in pages.iter().enumerate().skip(start).take(end - start) {
|
|
let text = extract_page_text(&doc, *page_id);
|
|
page_texts.push(serde_json::json!({
|
|
"page": i + 1,
|
|
"text": text,
|
|
"char_count": text.chars().count(),
|
|
}));
|
|
}
|
|
|
|
Ok(serde_json::json!({
|
|
"path": path,
|
|
"rev": rev,
|
|
"total_pages": total_pages,
|
|
"extracted_pages": page_texts.len(),
|
|
"pages": page_texts,
|
|
}))
|
|
}
|
|
|
|
pub fn register_pdf_tools(registry: &mut ToolRegistry) {
|
|
let p = HashMap::from([
|
|
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
|
|
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
|
|
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the PDF document".into()), required: true, properties: None, items: None }),
|
|
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
|
|
("page_start".into(), ToolParam { name: "page_start".into(), param_type: "integer".into(), description: Some("1-based starting page number (default: 1)".into()), required: false, properties: None, items: None }),
|
|
("page_end".into(), ToolParam { name: "page_end".into(), param_type: "integer".into(), description: Some("1-based ending page number (default: page_start + 20)".into()), required: false, properties: None, items: None }),
|
|
("max_pages".into(), ToolParam { name: "max_pages".into(), param_type: "integer".into(), description: Some("Maximum number of pages to extract (default: 20)".into()), required: false, properties: None, items: None }),
|
|
]);
|
|
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
|
|
registry.register(
|
|
ToolDefinition::new("read_pdf")
|
|
.description("Extract text content from PDF files. Returns page-by-page text extraction with character counts. Supports page range selection.")
|
|
.parameters(schema),
|
|
ToolHandler::new(|ctx, args| {
|
|
let gctx = GitToolCtx::new(ctx);
|
|
Box::pin(async move {
|
|
read_pdf_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
|
})
|
|
}),
|
|
);
|
|
}
|