Add AI-accessible tools for reading structured files (CSV, JSON/JSONC, Markdown, SQL) and searching repository content (git_grep). Also adds git_blob_get to retrieve raw blob text content with binary detection. Deferred: Excel, Word, PDF, PPT modules are stubbed out due to library API incompatibilities (calamine 0.26, lopdf 0.34, quick-xml 0.37).
287 lines
9.5 KiB
Rust
287 lines
9.5 KiB
Rust
//! read_markdown — parse and analyze Markdown files.
|
|
|
|
use crate::file_tools::MAX_FILE_SIZE;
|
|
use crate::git_tools::ctx::GitToolCtx;
|
|
use agent::{ToolDefinition, ToolHandler, ToolParam, ToolRegistry, ToolSchema};
|
|
use pulldown_cmark::{CodeBlockKind, Event, HeadingLevel, Parser, Tag, TagEnd};
|
|
use std::collections::HashMap;
|
|
|
|
async fn read_markdown_exec(
|
|
ctx: GitToolCtx,
|
|
args: serde_json::Value,
|
|
) -> Result<serde_json::Value, String> {
|
|
let p: serde_json::Map<String, serde_json::Value> =
|
|
serde_json::from_value(args).map_err(|e| e.to_string())?;
|
|
|
|
let project_name = p
|
|
.get("project_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing project_name")?;
|
|
let repo_name = p
|
|
.get("repo_name")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing repo_name")?;
|
|
let path = p
|
|
.get("path")
|
|
.and_then(|v| v.as_str())
|
|
.ok_or("missing path")?;
|
|
let rev = p
|
|
.get("rev")
|
|
.and_then(|v| v.as_str())
|
|
.map(String::from)
|
|
.unwrap_or_else(|| "HEAD".to_string());
|
|
let include_code = p
|
|
.get("include_code")
|
|
.and_then(|v| v.as_bool())
|
|
.unwrap_or(true);
|
|
let sections_only = p
|
|
.get("sections_only")
|
|
.and_then(|v| v.as_bool())
|
|
.unwrap_or(false);
|
|
|
|
let domain = ctx.open_repo(project_name, repo_name).await?;
|
|
|
|
let commit_oid = if rev.len() >= 40 {
|
|
git::commit::types::CommitOid::new(&rev)
|
|
} else {
|
|
domain
|
|
.commit_get_prefix(&rev)
|
|
.map_err(|e| e.to_string())?
|
|
.oid
|
|
};
|
|
|
|
let entry = domain
|
|
.tree_entry_by_path_from_commit(&commit_oid, path)
|
|
.map_err(|e| e.to_string())?;
|
|
let content = domain.blob_content(&entry.oid).map_err(|e| e.to_string())?;
|
|
|
|
let data = &content.content;
|
|
if data.len() > MAX_FILE_SIZE {
|
|
return Err(format!(
|
|
"file too large ({} bytes), max {} bytes",
|
|
data.len(),
|
|
MAX_FILE_SIZE
|
|
));
|
|
}
|
|
|
|
let text = String::from_utf8_lossy(data);
|
|
let parser = Parser::new(&text);
|
|
|
|
let mut sections: Vec<serde_json::Value> = Vec::new();
|
|
let mut code_blocks: Vec<serde_json::Value> = Vec::new();
|
|
let mut links: Vec<serde_json::Value> = Vec::new();
|
|
let mut images: Vec<serde_json::Value> = Vec::new();
|
|
|
|
let mut current_heading_level: Option<u32> = None;
|
|
let mut current_heading_text = String::new();
|
|
let mut in_code_block = false;
|
|
let mut code_block_lang = String::new();
|
|
let mut code_block_content = String::new();
|
|
|
|
let mut toc: Vec<serde_json::Value> = Vec::new();
|
|
|
|
for event in parser {
|
|
match event {
|
|
Event::Start(Tag::Heading { level, .. }) => {
|
|
current_heading_level = Some(match level {
|
|
HeadingLevel::H1 => 1,
|
|
HeadingLevel::H2 => 2,
|
|
HeadingLevel::H3 => 3,
|
|
HeadingLevel::H4 => 4,
|
|
HeadingLevel::H5 => 5,
|
|
HeadingLevel::H6 => 6,
|
|
});
|
|
current_heading_text.clear();
|
|
}
|
|
Event::End(TagEnd::Heading(level)) => {
|
|
let lvl = match level {
|
|
HeadingLevel::H1 => 1,
|
|
HeadingLevel::H2 => 2,
|
|
HeadingLevel::H3 => 3,
|
|
HeadingLevel::H4 => 4,
|
|
HeadingLevel::H5 => 5,
|
|
HeadingLevel::H6 => 6,
|
|
};
|
|
let heading = current_heading_text.trim().to_string();
|
|
if !heading.is_empty() {
|
|
let section = serde_json::json!({
|
|
"level": lvl,
|
|
"title": heading,
|
|
});
|
|
toc.push(section.clone());
|
|
if !sections_only {
|
|
sections.push(serde_json::json!({
|
|
"level": lvl,
|
|
"title": heading,
|
|
"content": "",
|
|
}));
|
|
}
|
|
}
|
|
current_heading_level = None;
|
|
}
|
|
Event::Text(text) => {
|
|
if in_code_block {
|
|
code_block_content.push_str(&text);
|
|
code_block_content.push('\n');
|
|
} else if let Some(_) = current_heading_level {
|
|
current_heading_text.push_str(&text);
|
|
current_heading_text.push(' ');
|
|
}
|
|
}
|
|
Event::Code(code) => {
|
|
code_blocks.push(serde_json::json!({
|
|
"language": "",
|
|
"code": code.as_ref(),
|
|
}));
|
|
}
|
|
Event::Start(Tag::CodeBlock(kind)) => {
|
|
in_code_block = true;
|
|
code_block_content.clear();
|
|
code_block_lang = match kind {
|
|
CodeBlockKind::Fenced(info) => info.as_ref().to_string(),
|
|
CodeBlockKind::Indented => String::new(),
|
|
};
|
|
}
|
|
Event::End(TagEnd::CodeBlock) => {
|
|
in_code_block = false;
|
|
if include_code {
|
|
code_blocks.push(serde_json::json!({
|
|
"language": code_block_lang,
|
|
"code": code_block_content.trim().to_string(),
|
|
}));
|
|
}
|
|
code_block_lang.clear();
|
|
}
|
|
Event::Start(Tag::Link { dest_url, .. }) => {
|
|
links.push(serde_json::json!({ "url": dest_url.to_string() }));
|
|
}
|
|
Event::Start(Tag::Image { dest_url, .. }) => {
|
|
images.push(serde_json::json!({ "url": dest_url.to_string() }));
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
|
|
// Build outline (h1/h2/h3 only)
|
|
let outline: Vec<serde_json::Value> = toc
|
|
.iter()
|
|
.filter(|s| {
|
|
let lvl = s.get("level").and_then(|v| v.as_u64()).unwrap_or(0) as u32;
|
|
lvl <= 3
|
|
})
|
|
.cloned()
|
|
.collect();
|
|
|
|
Ok(serde_json::json!({
|
|
"path": path,
|
|
"rev": rev,
|
|
"stats": {
|
|
"chars": text.chars().count(),
|
|
"words": text.split_whitespace().count(),
|
|
"lines": text.lines().count(),
|
|
"headings": toc.len(),
|
|
"code_blocks": code_blocks.len(),
|
|
"links": links.len(),
|
|
"images": images.len(),
|
|
},
|
|
"outline": outline,
|
|
"headings": toc,
|
|
"code_blocks": if include_code { code_blocks } else { vec![] },
|
|
"links": links,
|
|
"images": images,
|
|
}))
|
|
}
|
|
|
|
pub fn register_markdown_tools(registry: &mut ToolRegistry) {
|
|
let p = HashMap::from([
|
|
(
|
|
"project_name".into(),
|
|
ToolParam {
|
|
name: "project_name".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Project name (slug)".into()),
|
|
required: true,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"repo_name".into(),
|
|
ToolParam {
|
|
name: "repo_name".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Repository name".into()),
|
|
required: true,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"path".into(),
|
|
ToolParam {
|
|
name: "path".into(),
|
|
param_type: "string".into(),
|
|
description: Some("File path to the Markdown file".into()),
|
|
required: true,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"rev".into(),
|
|
ToolParam {
|
|
name: "rev".into(),
|
|
param_type: "string".into(),
|
|
description: Some("Git revision (default: HEAD)".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"sections_only".into(),
|
|
ToolParam {
|
|
name: "sections_only".into(),
|
|
param_type: "boolean".into(),
|
|
description: Some(
|
|
"If true, return only section headings (outline). Default: false".into(),
|
|
),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
(
|
|
"include_code".into(),
|
|
ToolParam {
|
|
name: "include_code".into(),
|
|
param_type: "boolean".into(),
|
|
description: Some("Include code blocks in result. Default: true".into()),
|
|
required: false,
|
|
properties: None,
|
|
items: None,
|
|
},
|
|
),
|
|
]);
|
|
let schema = ToolSchema {
|
|
schema_type: "object".into(),
|
|
properties: Some(p),
|
|
required: Some(vec![
|
|
"project_name".into(),
|
|
"repo_name".into(),
|
|
"path".into(),
|
|
]),
|
|
};
|
|
registry.register(
|
|
ToolDefinition::new("read_markdown")
|
|
.description("Parse and analyze a Markdown file. Returns document statistics, heading outline, code blocks with languages, links, and images.")
|
|
.parameters(schema),
|
|
ToolHandler::new(|ctx, args| {
|
|
let gctx = GitToolCtx::new(ctx);
|
|
Box::pin(async move {
|
|
read_markdown_exec(gctx, args).await.map_err(agent::ToolError::ExecutionError)
|
|
})
|
|
}),
|
|
);
|
|
}
|