gitdataai/libs/service/file_tools/pdf.rs
ZhenYi 1af796ac75 feat(service): add file_tools module and git_blob_get tool
Add AI-accessible tools for reading structured files (CSV, JSON/JSONC,
Markdown, SQL) and searching repository content (git_grep). Also adds
git_blob_get to retrieve raw blob text content with binary detection.

Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library
API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
2026-04-18 23:02:10 +08:00

245 lines
8.9 KiB
Rust

//! read_pdf — extract text from PDF files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use futures::FutureExt;
use lopdf::{Document, Object, ObjectId};
use std::collections::HashMap;
use std::io::Read;
/// Extract text content from a PDF page's content stream.
///
/// Best-effort: returns an empty string when the page object, its
/// `Contents` entry, or the referenced streams cannot be resolved.
fn extract_page_text(doc: &Document, page_id: ObjectId) -> String {
    let mut text = String::new();
    // Resolve the page dictionary for this page id.
    let page_dict = match doc.get(page_id) {
        Ok(dict) => dict,
        Err(_) => return text,
    };
    // `Contents` may be a single stream reference or an array of references.
    let content_streams = match page_dict.get(b"Contents") {
        Ok(obj) => obj.clone(),
        Err(_) => return text,
    };
    let stream_ids: Vec<ObjectId> = match &content_streams {
        Object::Reference(id) => vec![*id],
        Object::Array(arr) => arr
            .iter()
            .filter_map(|o| {
                if let Object::Reference(id) = o {
                    Some(*id)
                } else {
                    None
                }
            })
            .collect(),
        // Inline (non-reference) contents are not supported; bail out.
        _ => return text,
    };
    for stream_id in stream_ids {
        if let Ok((_, stream)) = doc.get_stream(stream_id) {
            // `decompressed_content()` already applies the stream's declared
            // filters (e.g. FlateDecode). The previous code additionally ran
            // the heuristic `decompress_pdf_stream` over that output, which
            // double-decodes: already-plain content could be mangled by a
            // spurious inflate attempt. Use the heuristic only as a fallback
            // when lopdf itself cannot decode the stream.
            let content = match stream.decompressed_content() {
                Ok(decoded) => decoded,
                Err(_) => decompress_pdf_stream(&stream.content),
            };
            text.push_str(&extract_text_from_content(&content));
            text.push('\n');
        }
    }
    text
}
/// Very simple PDF content stream text extraction.
///
/// Collects every literal string `(...)` found in the stream (the string
/// operands of Tj/TJ/'/" and friends), handling `\n`/`\r`/`\t`, octal
/// `\ddd` escapes, balanced nested parentheses, and `%` comments. Strings
/// are joined with spaces, then blank lines are dropped.
fn extract_text_from_content(content: &[u8]) -> String {
    let data = String::from_utf8_lossy(content);
    let mut result = String::new();
    let mut current_text = String::new();
    // Paren nesting depth of the current literal string; 0 = outside a
    // string. PDF literal strings may contain *balanced* unescaped parens
    // (ISO 32000-1 §7.3.4.2), so the old boolean `in_parens` flag mis-parsed
    // e.g. `(a(b)c)`: the inner '(' reset the buffer and the inner ')'
    // terminated the string early. The old dead `last_was_tj` flag (never
    // set to true, empty `if` body) is removed.
    let mut depth: usize = 0;
    let mut chars = data.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '(' if depth == 0 => {
                depth = 1;
                current_text.clear();
            }
            '(' => {
                // Nested unescaped '(' inside a string: literal character.
                depth += 1;
                current_text.push('(');
            }
            ')' if depth > 1 => {
                // Closes a nested paren, still inside the string.
                depth -= 1;
                current_text.push(')');
            }
            ')' if depth == 1 => {
                depth = 0;
                if !current_text.is_empty() {
                    result.push_str(&current_text);
                    result.push(' ');
                }
            }
            c if depth > 0 => {
                if c == '\\' {
                    if let Some(escaped) = chars.next() {
                        match escaped {
                            'n' => current_text.push('\n'),
                            'r' => current_text.push('\r'),
                            't' => current_text.push('\t'),
                            '0'..='7' => {
                                // Octal escape \ddd (1-3 digits, PDF spec);
                                // the old code pushed the digits literally.
                                let mut code = escaped as u32 - '0' as u32;
                                for _ in 0..2 {
                                    match chars.peek() {
                                        Some(&d @ '0'..='7') => {
                                            code = code * 8 + (d as u32 - '0' as u32);
                                            chars.next();
                                        }
                                        _ => break,
                                    }
                                }
                                current_text.push(char::from((code & 0xff) as u8));
                            }
                            // \(, \), \\ and unknown escapes: keep the char.
                            _ => current_text.push(escaped),
                        }
                    }
                } else {
                    current_text.push(c);
                }
            }
            '%' => {
                // Comment outside a string: skip to end of line.
                for nc in chars.by_ref() {
                    if nc == '\n' || nc == '\r' {
                        break;
                    }
                }
            }
            _ => {}
        }
    }
    // Clean up excessive newlines.
    let lines: Vec<&str> = result.lines().map(|l| l.trim()).filter(|l| !l.is_empty()).collect();
    lines.join("\n")
}
/// Best-effort decompression of a raw PDF stream body.
///
/// Tries zlib-wrapped deflate when the two-byte zlib header validates,
/// then raw deflate, and finally returns the input unchanged so plain
/// (uncompressed) stream data still flows through.
fn decompress_pdf_stream(data: &[u8]) -> Vec<u8> {
    if data.len() < 2 {
        return data.to_vec();
    }
    // A valid zlib header (RFC 1950 §2.2) has CM == 8 in the low nibble of
    // the first byte, and the two header bytes read big-endian must be
    // divisible by 31 (FCHECK). This accepts all real headers (0x78 0x01,
    // 0x78 0x9c, 0x78 0xda, ...). The old check also matched `[0x08, 0x1b]`,
    // which fails the checksum (0x081b % 31 == 29) and is never a real
    // zlib header.
    let is_zlib =
        data[0] & 0x0f == 8 && ((u16::from(data[0]) << 8) | u16::from(data[1])) % 31 == 0;
    if is_zlib {
        let mut out = Vec::new();
        if flate2::read::ZlibDecoder::new(data).read_to_end(&mut out).is_ok() {
            return out;
        }
    }
    // Try raw deflate (no zlib wrapper).
    let mut out = Vec::new();
    if flate2::read::DeflateDecoder::new(data).read_to_end(&mut out).is_ok() {
        return out;
    }
    // Not compressed (or unrecognized): hand back the original bytes.
    data.to_vec()
}
/// Tool executor for `read_pdf`: load a PDF blob at `path` from a git
/// revision and extract its text page by page.
///
/// JSON arguments:
/// - `project_name`, `repo_name`, `path` — required blob coordinates.
/// - `rev` — git revision (full oid or prefix/ref), default `"HEAD"`.
/// - `page_start` / `page_end` — 1-based inclusive page range, matching
///   the tool schema's parameter descriptions.
/// - `max_pages` — cap applied when `page_end` is absent (default 20).
///
/// Returns a JSON object with `total_pages` and a `pages` array (each
/// entry: 1-based `page`, `text`, `char_count`), or an error string.
async fn read_pdf_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let page_start = p.get("page_start").and_then(|v| v.as_u64()).map(|v| v as usize);
    let page_end = p.get("page_end").and_then(|v| v.as_u64()).map(|v| v as usize);
    let max_pages = p
        .get("max_pages")
        .and_then(|v| v.as_u64())
        .unwrap_or(20) as usize;
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // A 40+ character rev is taken to be a full commit oid; anything shorter
    // is resolved as a prefix/ref through the repository.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }
    let doc = Document::load_from_mem(data)
        .map_err(|e| format!("failed to parse PDF: {}", e))?;
    // Collect page object ids in document order.
    let pages: Vec<ObjectId> = doc
        .pages
        .values()
        .cloned()
        .collect();
    let total_pages = pages.len();
    // The tool schema documents page_start as 1-based, but the old code used
    // it directly as a 0-based index, so `page_start = 1` skipped the first
    // page. Convert 1-based -> 0-based here (page_start = 0 is tolerated as
    // "from the beginning").
    let start = page_start
        .map(|v| v.saturating_sub(1))
        .unwrap_or(0)
        .min(total_pages.saturating_sub(1));
    // A 1-based inclusive page_end is numerically equal to the 0-based
    // exclusive end index, so it can be used as-is after clamping.
    let end = page_end.unwrap_or(start + max_pages).min(total_pages);
    let mut page_texts: Vec<serde_json::Value> = Vec::new();
    // saturating_sub guards against page_end < page_start (the old
    // `end - start` would panic in debug builds on that input).
    for (i, page_id) in pages.iter().enumerate().skip(start).take(end.saturating_sub(start)) {
        let text = extract_page_text(&doc, *page_id);
        page_texts.push(serde_json::json!({
            "page": i + 1,
            "text": text,
            "char_count": text.chars().count(),
        }));
    }
    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "total_pages": total_pages,
        "extracted_pages": page_texts.len(),
        "pages": page_texts,
    }))
}
pub fn register_pdf_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the PDF document".into()), required: true, properties: None, items: None }),
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
("page_start".into(), ToolParam { name: "page_start".into(), param_type: "integer".into(), description: Some("1-based starting page number (default: 1)".into()), required: false, properties: None, items: None }),
("page_end".into(), ToolParam { name: "page_end".into(), param_type: "integer".into(), description: Some("1-based ending page number (default: page_start + 20)".into()), required: false, properties: None, items: None }),
("max_pages".into(), ToolParam { name: "max_pages".into(), param_type: "integer".into(), description: Some("Maximum number of pages to extract (default: 20)".into()), required: false, properties: None, items: None }),
]);
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
registry.register(
ToolDefinition::new("read_pdf")
.description("Extract text content from PDF files. Returns page-by-page text extraction with character counts. Supports page range selection.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_pdf_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}