gitdataai/libs/service/file_tools/markdown.rs
ZhenYi 1af796ac75 feat(service): add file_tools module and git_blob_get tool
Add AI-accessible tools for reading structured files (CSV, JSON/JSONC,
Markdown, SQL) and searching repository content (git_grep). Also adds
git_blob_get to retrieve raw blob text content with binary detection.

Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library
API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
2026-04-18 23:02:10 +08:00

287 lines
9.5 KiB
Rust

//! read_markdown — parse and analyze Markdown files.
use crate::file_tools::MAX_FILE_SIZE;
use crate::git_tools::ctx::GitToolCtx;
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Parser, Tag, TagEnd};
use std::collections::HashMap;
async fn read_markdown_exec(
ctx: GitToolCtx,
args: serde_json::Value,
) -> Result<serde_json::Value, String> {
let p: serde_json::Map<String, serde_json::Value> =
serde_json::from_value(args).map_err(|e| e.to_string())?;
let project_name = p
.get("project_name")
.and_then(|v| v.as_str())
.ok_or("missing project_name")?;
let repo_name = p
.get("repo_name")
.and_then(|v| v.as_str())
.ok_or("missing repo_name")?;
let path = p
.get("path")
.and_then(|v| v.as_str())
.ok_or("missing path")?;
let rev = p
.get("rev")
.and_then(|v| v.as_str())
.map(String::from)
.unwrap_or_else(|| "HEAD".to_string());
let include_code = p
.get("include_code")
.and_then(|v| v.as_bool())
.unwrap_or(true);
let sections_only = p
.get("sections_only")
.and_then(|v| v.as_bool())
.unwrap_or(false);
let domain = ctx.open_repo(project_name, repo_name).await?;
let commit_oid = if rev.len() >= 40 {
git::commit::types::CommitOid::new(&rev)
} else {
domain
.commit_get_prefix(&rev)
.map_err(|e| e.to_string())?
.oid
};
let entry = domain
.tree_entry_by_path_from_commit(&commit_oid, path)
.map_err(|e| e.to_string())?;
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
let data = &content.content;
if data.len() > MAX_FILE_SIZE {
return Err(format!(
"file too large ({} bytes), max {} bytes",
data.len(),
MAX_FILE_SIZE
));
}
let text = String::from_utf8_lossy(data);
let parser = Parser::new(&text);
let mut sections: Vec<serde_json::Value> = Vec::new();
let mut code_blocks: Vec<serde_json::Value> = Vec::new();
let mut links: Vec<serde_json::Value> = Vec::new();
let mut images: Vec<serde_json::Value> = Vec::new();
let mut current_heading_level: Option<u32> = None;
let mut current_heading_text = String::new();
let mut in_code_block = false;
let mut code_block_lang = String::new();
let mut code_block_content = String::new();
let mut toc: Vec<serde_json::Value> = Vec::new();
for event in parser {
match event {
Event::Start(Tag::Heading { level, .. }) => {
current_heading_level = Some(match level {
HeadingLevel::H1 => 1,
HeadingLevel::H2 => 2,
HeadingLevel::H3 => 3,
HeadingLevel::H4 => 4,
HeadingLevel::H5 => 5,
HeadingLevel::H6 => 6,
});
current_heading_text.clear();
}
Event::End(TagEnd::Heading(level)) => {
let lvl = match level {
HeadingLevel::H1 => 1,
HeadingLevel::H2 => 2,
HeadingLevel::H3 => 3,
HeadingLevel::H4 => 4,
HeadingLevel::H5 => 5,
HeadingLevel::H6 => 6,
};
let heading = current_heading_text.trim().to_string();
if !heading.is_empty() {
let section = serde_json::json!({
"level": lvl,
"title": heading,
});
toc.push(section.clone());
if !sections_only {
sections.push(serde_json::json!({
"level": lvl,
"title": heading,
"content": "",
}));
}
}
current_heading_level = None;
}
Event::Text(text) => {
if in_code_block {
code_block_content.push_str(&text);
code_block_content.push('\n');
} else if let Some(_) = current_heading_level {
current_heading_text.push_str(&text);
current_heading_text.push(' ');
}
}
Event::Code(code) => {
code_blocks.push(serde_json::json!({
"language": "",
"code": code.as_ref(),
}));
}
Event::Start(Tag::CodeBlock(kind)) => {
in_code_block = true;
code_block_content.clear();
code_block_lang = match kind {
CodeBlockKind::Fenced(info) => info.as_ref().to_string(),
CodeBlockKind::Indented => String::new(),
};
}
Event::End(TagEnd::CodeBlock) => {
in_code_block = false;
if include_code {
code_blocks.push(serde_json::json!({
"language": code_block_lang,
"code": code_block_content.trim().to_string(),
}));
}
code_block_lang.clear();
}
Event::Start(Tag::Link { dest_url, .. }) => {
links.push(serde_json::json!({ "url": dest_url.to_string() }));
}
Event::Start(Tag::Image { dest_url, .. }) => {
images.push(serde_json::json!({ "url": dest_url.to_string() }));
}
_ => {}
}
}
// Build outline (h1/h2/h3 only)
let outline: Vec<serde_json::Value> = toc
.iter()
.filter(|s| {
let lvl = s.get("level").and_then(|v| v.as_u64()).unwrap_or(0) as u32;
lvl <= 3
})
.cloned()
.collect();
Ok(serde_json::json!({
"path": path,
"rev": rev,
"stats": {
"chars": text.chars().count(),
"words": text.split_whitespace().count(),
"lines": text.lines().count(),
"headings": toc.len(),
"code_blocks": code_blocks.len(),
"links": links.len(),
"images": images.len(),
},
"outline": outline,
"headings": toc,
"code_blocks": if include_code { code_blocks } else { vec![] },
"links": links,
"images": images,
}))
}
/// Registers the `read_markdown` tool, its parameter schema and its handler
/// with the given registry.
pub fn register_markdown_tools(registry: &mut ToolRegistry) {
    // One row per parameter: (name, JSON type, description, required).
    let specs = [
        ("project_name", "string", "Project name (slug)", true),
        ("repo_name", "string", "Repository name", true),
        ("path", "string", "File path to the Markdown file", true),
        ("rev", "string", "Git revision (default: HEAD)", false),
        (
            "sections_only",
            "boolean",
            "If true, return only section headings (outline). Default: false",
            false,
        ),
        (
            "include_code",
            "boolean",
            "Include code blocks in result. Default: true",
            false,
        ),
    ];

    let mut properties = HashMap::new();
    let mut required_names = Vec::new();
    for (name, param_type, description, required) in specs {
        // The required list follows the table order: project_name, repo_name, path.
        if required {
            required_names.push(name.into());
        }
        properties.insert(
            name.into(),
            ToolParam {
                name: name.into(),
                param_type: param_type.into(),
                description: Some(description.into()),
                required,
                properties: None,
                items: None,
            },
        );
    }

    let definition = ToolDefinition::new("read_markdown")
        .description("Parse and analyze a Markdown file. Returns document statistics, heading outline, code blocks with languages, links, and images.")
        .parameters(ToolSchema {
            schema_type: "object".into(),
            properties: Some(properties),
            required: Some(required_names),
        });

    // The handler wraps the generic tool context and forwards to the executor,
    // converting its string error into a tool execution error.
    let handler = ToolHandler::new(|ctx, args| {
        let git_ctx = GitToolCtx::new(ctx);
        Box::pin(async move {
            read_markdown_exec(git_ctx, args)
                .await
                .map_err(agent::ToolError::ExecutionError)
        })
    });

    registry.register(definition, handler);
}