gitdataai/libs/service/file_tools/pdf.rs
ZhenYi 1af796ac75 feat(service): add file_tools module and git_blob_get tool
Add AI-accessible tools for reading structured files (CSV, JSON/JSONC,
Markdown, SQL) and searching repository content (git_grep). Also adds
git_blob_get to retrieve raw blob text content with binary detection.

Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library
API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
2026-04-18 23:02:10 +08:00

245 lines
8.9 KiB
Rust

//! read_pdf — extract text from PDF files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use futures::FutureExt;
use lopdf::{Document, Object, ObjectId};
use std::collections::HashMap;
use std::io::Read;
/// Extract text content from a PDF page's content stream.
///
/// Best-effort: returns an empty string when the page object, its
/// `Contents` entry, or the referenced streams cannot be resolved.
fn extract_page_text(doc: &Document, page_id: ObjectId) -> String {
    let mut text = String::new();
    // Resolve the page dictionary for this page id.
    let page_dict = match doc.get(page_id) {
        Ok(dict) => dict,
        Err(_) => return text,
    };
    // `Contents` may be a single stream reference or an array of references.
    let content_streams = match page_dict.get(b"Contents") {
        Ok(obj) => obj.clone(),
        Err(_) => return text,
    };
    let stream_ids: Vec<ObjectId> = match &content_streams {
        Object::Reference(id) => vec![*id],
        Object::Array(arr) => arr
            .iter()
            .filter_map(|o| {
                if let Object::Reference(id) = o {
                    Some(*id)
                } else {
                    None
                }
            })
            .collect(),
        // Inline (non-reference) contents are not supported; bail out.
        _ => return text,
    };
    for stream_id in stream_ids {
        if let Ok((_, stream)) = doc.get_stream(stream_id) {
            // `decompressed_content()` already applies the stream's declared
            // filters (e.g. FlateDecode). The previous code additionally ran
            // the heuristic `decompress_pdf_stream` over that output, which
            // double-decodes: already-plain content could be mangled by a
            // spurious inflate attempt. Use the heuristic only as a fallback
            // when lopdf itself cannot decode the stream.
            let content = match stream.decompressed_content() {
                Ok(decoded) => decoded,
                Err(_) => decompress_pdf_stream(&stream.content),
            };
            text.push_str(&extract_text_from_content(&content));
            text.push('\n');
        }
    }
    text
}
/// Very simple PDF content stream text extraction.
///
/// Collects every literal string `(...)` found in the stream (the string
/// operands of Tj/TJ/'/" and friends), handling `\n`/`\r`/`\t`, octal
/// `\ddd` escapes, balanced nested parentheses, and `%` comments. Strings
/// are joined with spaces, then blank lines are dropped.
fn extract_text_from_content(content: &[u8]) -> String {
    let data = String::from_utf8_lossy(content);
    let mut result = String::new();
    let mut current_text = String::new();
    // Paren nesting depth of the current literal string; 0 = outside a
    // string. PDF literal strings may contain *balanced* unescaped parens
    // (ISO 32000-1 §7.3.4.2), so the old boolean `in_parens` flag mis-parsed
    // e.g. `(a(b)c)`: the inner '(' reset the buffer and the inner ')'
    // terminated the string early. The old dead `last_was_tj` flag (never
    // set to true, empty `if` body) is removed.
    let mut depth: usize = 0;
    let mut chars = data.chars().peekable();
    while let Some(c) = chars.next() {
        match c {
            '(' if depth == 0 => {
                depth = 1;
                current_text.clear();
            }
            '(' => {
                // Nested unescaped '(' inside a string: literal character.
                depth += 1;
                current_text.push('(');
            }
            ')' if depth > 1 => {
                // Closes a nested paren, still inside the string.
                depth -= 1;
                current_text.push(')');
            }
            ')' if depth == 1 => {
                depth = 0;
                if !current_text.is_empty() {
                    result.push_str(&current_text);
                    result.push(' ');
                }
            }
            c if depth > 0 => {
                if c == '\\' {
                    if let Some(escaped) = chars.next() {
                        match escaped {
                            'n' => current_text.push('\n'),
                            'r' => current_text.push('\r'),
                            't' => current_text.push('\t'),
                            '0'..='7' => {
                                // Octal escape \ddd (1-3 digits, PDF spec);
                                // the old code pushed the digits literally.
                                let mut code = escaped as u32 - '0' as u32;
                                for _ in 0..2 {
                                    match chars.peek() {
                                        Some(&d @ '0'..='7') => {
                                            code = code * 8 + (d as u32 - '0' as u32);
                                            chars.next();
                                        }
                                        _ => break,
                                    }
                                }
                                current_text.push(char::from((code & 0xff) as u8));
                            }
                            // \(, \), \\ and unknown escapes: keep the char.
                            _ => current_text.push(escaped),
                        }
                    }
                } else {
                    current_text.push(c);
                }
            }
            '%' => {
                // Comment outside a string: skip to end of line.
                for nc in chars.by_ref() {
                    if nc == '\n' || nc == '\r' {
                        break;
                    }
                }
            }
            _ => {}
        }
    }
    // Clean up excessive newlines.
    let lines: Vec<&str> = result.lines().map(|l| l.trim()).filter(|l| !l.is_empty()).collect();
    lines.join("\n")
}
/// Best-effort decompression of a raw PDF stream body.
///
/// Tries zlib-wrapped deflate when the two-byte zlib header validates,
/// then raw deflate, and finally returns the input unchanged so plain
/// (uncompressed) stream data still flows through.
fn decompress_pdf_stream(data: &[u8]) -> Vec<u8> {
    if data.len() < 2 {
        return data.to_vec();
    }
    // A valid zlib header (RFC 1950 §2.2) has CM == 8 in the low nibble of
    // the first byte, and the two header bytes read big-endian must be
    // divisible by 31 (FCHECK). This accepts all real headers (0x78 0x01,
    // 0x78 0x9c, 0x78 0xda, ...). The old check also matched `[0x08, 0x1b]`,
    // which fails the checksum (0x081b % 31 == 29) and is never a real
    // zlib header.
    let is_zlib =
        data[0] & 0x0f == 8 && ((u16::from(data[0]) << 8) | u16::from(data[1])) % 31 == 0;
    if is_zlib {
        let mut out = Vec::new();
        if flate2::read::ZlibDecoder::new(data).read_to_end(&mut out).is_ok() {
            return out;
        }
    }
    // Try raw deflate (no zlib wrapper).
    let mut out = Vec::new();
    if flate2::read::DeflateDecoder::new(data).read_to_end(&mut out).is_ok() {
        return out;
    }
    // Not compressed (or unrecognized): hand back the original bytes.
    data.to_vec()
}
/// Tool executor for `read_pdf`: load a PDF blob at `path` from a git
/// revision and extract its text page by page.
///
/// JSON arguments:
/// - `project_name`, `repo_name`, `path` — required blob coordinates.
/// - `rev` — git revision (full oid or prefix/ref), default `"HEAD"`.
/// - `page_start` / `page_end` — 1-based inclusive page range, matching
///   the tool schema's parameter descriptions.
/// - `max_pages` — cap applied when `page_end` is absent (default 20).
///
/// Returns a JSON object with `total_pages` and a `pages` array (each
/// entry: 1-based `page`, `text`, `char_count`), or an error string.
async fn read_pdf_exec(
    ctx: GitToolCtx,
    args: serde_json::Value,
) -> Result<serde_json::Value, String> {
    let p: serde_json::Map<String, serde_json::Value> =
        serde_json::from_value(args).map_err(|e| e.to_string())?;
    let project_name = p
        .get("project_name")
        .and_then(|v| v.as_str())
        .ok_or("missing project_name")?;
    let repo_name = p
        .get("repo_name")
        .and_then(|v| v.as_str())
        .ok_or("missing repo_name")?;
    let path = p.get("path").and_then(|v| v.as_str()).ok_or("missing path")?;
    let rev = p
        .get("rev")
        .and_then(|v| v.as_str())
        .map(String::from)
        .unwrap_or_else(|| "HEAD".to_string());
    let page_start = p.get("page_start").and_then(|v| v.as_u64()).map(|v| v as usize);
    let page_end = p.get("page_end").and_then(|v| v.as_u64()).map(|v| v as usize);
    let max_pages = p
        .get("max_pages")
        .and_then(|v| v.as_u64())
        .unwrap_or(20) as usize;
    let domain = ctx.open_repo(project_name, repo_name).await?;
    // A 40+ character rev is taken to be a full commit oid; anything shorter
    // is resolved as a prefix/ref through the repository.
    let commit_oid = if rev.len() >= 40 {
        git::commit::types::CommitOid::new(&rev)
    } else {
        domain
            .commit_get_prefix(&rev)
            .map_err(|e| e.to_string())?
            .oid
    };
    let entry = domain
        .tree_entry_by_path_from_commit(&commit_oid, path)
        .map_err(|e| e.to_string())?;
    let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
    let data = &content.content;
    if data.len() > MAX_FILE_SIZE {
        return Err(format!(
            "file too large ({} bytes), max {} bytes",
            data.len(),
            MAX_FILE_SIZE
        ));
    }
    let doc = Document::load_from_mem(data)
        .map_err(|e| format!("failed to parse PDF: {}", e))?;
    // Collect page object ids in document order.
    let pages: Vec<ObjectId> = doc
        .pages
        .values()
        .cloned()
        .collect();
    let total_pages = pages.len();
    // The tool schema documents page_start as 1-based, but the old code used
    // it directly as a 0-based index, so `page_start = 1` skipped the first
    // page. Convert 1-based -> 0-based here (page_start = 0 is tolerated as
    // "from the beginning").
    let start = page_start
        .map(|v| v.saturating_sub(1))
        .unwrap_or(0)
        .min(total_pages.saturating_sub(1));
    // A 1-based inclusive page_end is numerically equal to the 0-based
    // exclusive end index, so it can be used as-is after clamping.
    let end = page_end.unwrap_or(start + max_pages).min(total_pages);
    let mut page_texts: Vec<serde_json::Value> = Vec::new();
    // saturating_sub guards against page_end < page_start (the old
    // `end - start` would panic in debug builds on that input).
    for (i, page_id) in pages.iter().enumerate().skip(start).take(end.saturating_sub(start)) {
        let text = extract_page_text(&doc, *page_id);
        page_texts.push(serde_json::json!({
            "page": i + 1,
            "text": text,
            "char_count": text.chars().count(),
        }));
    }
    Ok(serde_json::json!({
        "path": path,
        "rev": rev,
        "total_pages": total_pages,
        "extracted_pages": page_texts.len(),
        "pages": page_texts,
    }))
}
pub fn register_pdf_tools(registry: &mut ToolRegistry) {
let p = HashMap::from([
("project_name".into(), ToolParam { name: "project_name".into(), param_type: "string".into(), description: Some("Project name (slug)".into()), required: true, properties: None, items: None }),
("repo_name".into(), ToolParam { name: "repo_name".into(), param_type: "string".into(), description: Some("Repository name".into()), required: true, properties: None, items: None }),
("path".into(), ToolParam { name: "path".into(), param_type: "string".into(), description: Some("File path to the PDF document".into()), required: true, properties: None, items: None }),
("rev".into(), ToolParam { name: "rev".into(), param_type: "string".into(), description: Some("Git revision (default: HEAD)".into()), required: false, properties: None, items: None }),
("page_start".into(), ToolParam { name: "page_start".into(), param_type: "integer".into(), description: Some("1-based starting page number (default: 1)".into()), required: false, properties: None, items: None }),
("page_end".into(), ToolParam { name: "page_end".into(), param_type: "integer".into(), description: Some("1-based ending page number (default: page_start + 20)".into()), required: false, properties: None, items: None }),
("max_pages".into(), ToolParam { name: "max_pages".into(), param_type: "integer".into(), description: Some("Maximum number of pages to extract (default: 20)".into()), required: false, properties: None, items: None }),
]);
let schema = ToolSchema { schema_type: "object".into(), properties: Some(p), required: Some(vec!["project_name".into(), "repo_name".into(), "path".into()]) };
registry.register(
ToolDefinition::new("read_pdf")
.description("Extract text content from PDF files. Returns page-by-page text extraction with character counts. Supports page range selection.")
.parameters(schema),
ToolHandler::new(|ctx, args| {
let gctx = GitToolCtx::new(ctx);
Box::pin(async move {
read_pdf_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
})
}),
);
}