gitdataai/libs/service/skill/scanner.rs

387 lines
13 KiB
Rust

//! Repository skill scanner.
//!
//! Scans repositories for SKILL.md files and upserts skill records.
use crate::error::AppError;
use chrono::Utc;
use git2::Repository;
use models::ActiveModelTrait;
use models::projects::project_skill::ActiveModel as SkillActiveModel;
use models::projects::project_skill::Column as C;
use models::projects::project_skill::Entity as SkillEntity;
use models::repos::repo::Model as RepoModel;
use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, Set};
use sha1::Digest;
use std::path::Path;
use uuid::Uuid;
/// A skill parsed from one `SKILL.md` file during a repository scan.
#[derive(Debug)]
pub struct DiscoveredSkill {
    /// Identifier; the scanners in this module build it as
    /// `{short_repo_id}/{parent_dir_name}`.
    pub slug: String,
    /// Display name: the frontmatter `name` string, or the slug with `-` and
    /// `_` replaced by spaces when no such string exists.
    pub name: String,
    /// Frontmatter `description` string, when present.
    pub description: Option<String>,
    /// Trimmed markdown body that follows the frontmatter.
    pub content: String,
    /// Frontmatter decoded into JSON; `null` when absent or unparseable.
    pub metadata: serde_json::Value,
    /// Commit SHA the skill was found at; `None` straight after parsing and
    /// filled in later by the scan-and-sync entry point.
    pub commit_sha: Option<String>,
    /// Git blob SHA-1 of the raw `SKILL.md` bytes; `None` straight after
    /// parsing and set by the scanners.
    pub blob_hash: Option<String>,
}
/// Returns the hex-encoded SHA-1 that git assigns to a blob holding `content`.
///
/// Git hashes the header `blob {len}\0` followed by the raw bytes, so both
/// pieces are fed to the hasher in that order.
fn git_blob_hash(content: &[u8]) -> String {
    let mut sha = sha1::Sha1::new();
    sha.update(format!("blob {}\0", content.len()).as_bytes());
    sha.update(content);
    let digest = sha.finalize();
    hex::encode(digest)
}
/// Builds a [`DiscoveredSkill`] from the raw text of a `SKILL.md` file.
///
/// * `slug` - identifier stored on the result and used for the fallback name.
/// * `raw` - full file text, optionally starting with `---` frontmatter.
///
/// `commit_sha` and `blob_hash` are left as `None`; callers fill them in.
///
/// NOTE(review): the frontmatter is decoded with `serde_json`, so YAML-style
/// `key: value` frontmatter fails to parse and silently yields `null`
/// metadata. Confirm JSON frontmatter is what skill authors are expected to write.
///
/// NOTE(review): the scanners in this file pass slugs shaped like
/// `{short_repo_id}/{dir}`, so the fallback name keeps the repo-id prefix.
/// Confirm that is intended.
fn parse_skill_file(slug: &str, raw: &str) -> DiscoveredSkill {
    let (frontmatter, body) = extract_frontmatter(raw);

    // Missing or undecodable frontmatter both collapse to JSON `null`.
    let metadata: serde_json::Value = match frontmatter {
        Some(text) => serde_json::from_str(text).unwrap_or_default(),
        None => serde_json::Value::default(),
    };

    // Only string-valued fields are honoured; other JSON types are ignored.
    let string_field = |key: &str| -> Option<String> {
        metadata
            .get(key)
            .and_then(|value| value.as_str())
            .map(String::from)
    };

    let name = match string_field("name") {
        Some(explicit) => explicit,
        None => slug.replace('-', " ").replace('_', " "),
    };
    let description = string_field("description");

    DiscoveredSkill {
        slug: slug.to_string(),
        name,
        description,
        content: body.trim().to_string(),
        metadata,
        commit_sha: None,
        blob_hash: None,
    }
}
/// Splits leading `---` frontmatter from the markdown body.
///
/// Leading whitespace in `raw` is ignored. Frontmatter is recognised when the
/// text starts with `---` and a later line consists solely of `---`
/// (trailing whitespace, including `\r`, is tolerated).
///
/// Returns `(Some(frontmatter), body)` where `frontmatter` is the text between
/// the two delimiters (surrounding newlines included) and `body` has its
/// leading whitespace removed. Returns `(None, trimmed_input)` when there is
/// no opening delimiter or no closing delimiter line.
fn extract_frontmatter(raw: &str) -> (Option<&str>, &str) {
    const DELIMITER: &str = "---";

    let trimmed = raw.trim_start();
    let Some(after_open) = trimmed.strip_prefix(DELIMITER) else {
        return (None, trimmed);
    };

    // Walk line by line so that a `---` embedded in a value (for example
    // `name: a---b`) is not mistaken for the closing delimiter.
    let mut offset = 0;
    for (index, line) in after_open.split_inclusive('\n').enumerate() {
        // Index 0 is the remainder of the opening delimiter's own line.
        if index > 0 && line.trim_end() == DELIMITER {
            let frontmatter = &after_open[..offset];
            let rest = after_open[offset + DELIMITER.len()..].trim_start();
            return (Some(frontmatter), rest);
        }
        offset += line.len();
    }

    (None, trimmed)
}
/// Walks the work tree at `repo_path` and collects every `SKILL.md` file
/// (file name matched ASCII case-insensitively).
///
/// Each hit becomes a [`DiscoveredSkill`] whose slug is
/// `{first 8 chars of repo_id}/{parent_dir_name}` and whose `blob_hash` is the
/// git blob SHA-1 of the file bytes. `commit_sha` is left unset.
///
/// Traversal rules:
/// * Directories whose name starts with `.` (for example `.git`) are not
///   descended into, matching `scan_repo_tree_for_skills`.
/// * Symlinks are neither followed nor read, so the walk cannot leave the
///   work tree or loop on a link cycle.
/// * Unreadable directories and files are skipped; the walk is best-effort.
/// * A `SKILL.md` whose parent directory name is hidden or not valid UTF-8 is
///   skipped.
///
/// # Errors
///
/// The current implementation always returns `Ok`; the `Result` is kept for
/// interface compatibility.
pub fn scan_repo_for_skills(
    repo_path: &Path,
    repo_id: Uuid,
) -> Result<Vec<DiscoveredSkill>, AppError> {
    // The textual UUID is ASCII, so slicing the first 8 bytes is safe.
    let repo_id_text = repo_id.to_string();
    let repo_id_prefix = &repo_id_text[..8];

    let mut discovered = Vec::new();
    let mut stack = vec![repo_path.to_path_buf()];

    while let Some(dir) = stack.pop() {
        let Ok(entries) = std::fs::read_dir(&dir) else {
            continue;
        };

        for entry in entries.flatten() {
            // `DirEntry::file_type` reports the entry itself, not a symlink's
            // target, which is what keeps the walk inside the work tree.
            let Ok(file_type) = entry.file_type() else {
                continue;
            };
            let file_name = entry.file_name();

            if file_type.is_dir() {
                if !file_name.to_string_lossy().starts_with('.') {
                    stack.push(entry.path());
                }
                continue;
            }

            let is_skill_file = file_type.is_file()
                && file_name
                    .to_str()
                    .map_or(false, |name| name.eq_ignore_ascii_case("skill.md"));
            if !is_skill_file {
                continue;
            }

            let path = entry.path();
            let Some(dir_name) = path
                .parent()
                .and_then(|parent| parent.file_name())
                .and_then(|name| name.to_str())
                .filter(|name| !name.starts_with('.'))
            else {
                continue;
            };

            let Ok(raw) = std::fs::read(&path) else {
                continue;
            };
            let slug = format!("{}/{}", repo_id_prefix, dir_name);
            let mut skill = parse_skill_file(&slug, &String::from_utf8_lossy(&raw));
            skill.blob_hash = Some(git_blob_hash(&raw));
            discovered.push(skill);
        }
    }

    Ok(discovered)
}
/// Collects `SKILL.md` blobs by walking the tree of the repository's HEAD.
///
/// No work tree is needed: blob contents are read from git objects through
/// libgit2. Each hit gets the slug
/// `{first 8 chars of repo_id}/{parent_dir_name}` and the git blob SHA-1 of
/// its bytes; `commit_sha` is left unset.
///
/// Sub-trees whose name starts with `.` are not descended into, a `SKILL.md`
/// at the repository root (no parent directory) is skipped, and entries that
/// cannot be named or loaded are silently ignored.
///
/// # Errors
///
/// Returns `AppError::InternalServerError` when HEAD cannot be resolved or
/// cannot be peeled to a tree.
pub fn scan_repo_tree_for_skills(
    git_repo: &Repository,
    repo_id: Uuid,
) -> Result<Vec<DiscoveredSkill>, AppError> {
    let short_id = &repo_id.to_string()[..8];

    let head_ref = git_repo
        .head()
        .map_err(|e| AppError::InternalServerError(format!("no HEAD: {e}")))?;
    let root = head_ref
        .peel_to_tree()
        .map_err(|e| AppError::InternalServerError(format!("no tree: {e}")))?;

    let mut found = Vec::new();
    // Work list of trees still to visit, each paired with its root-relative path.
    let mut pending: Vec<(git2::Tree<'_>, String)> = vec![(root, String::new())];

    while let Some((tree, parent_path)) = pending.pop() {
        for item in tree.iter() {
            let Some(item_name) = item.name() else {
                continue;
            };
            let full_path = if parent_path.is_empty() {
                item_name.to_string()
            } else {
                format!("{}/{}", parent_path, item_name)
            };

            match item.kind() {
                Some(git2::ObjectType::Tree) if !item_name.starts_with('.') => {
                    let child = item.to_object(git_repo).and_then(|obj| obj.peel_to_tree());
                    if let Ok(child) = child {
                        pending.push((child, full_path));
                    }
                }
                Some(git2::ObjectType::Blob) if item_name.to_lowercase() == "skill.md" => {
                    // The skill is named after the directory holding the file.
                    let parent_dir = std::path::Path::new(&full_path)
                        .parent()
                        .and_then(|dir| dir.file_name())
                        .and_then(|dir| dir.to_str())
                        .filter(|dir| !dir.starts_with('.'));

                    if let Some(parent_dir) = parent_dir {
                        let slug = format!("{}/{}", short_id, parent_dir);
                        let loaded = item.to_object(git_repo).and_then(|obj| obj.peel_to_blob());
                        if let Ok(blob) = loaded {
                            let bytes = blob.content();
                            let digest = git_blob_hash(bytes);
                            let mut skill =
                                parse_skill_file(&slug, &String::from_utf8_lossy(bytes));
                            skill.blob_hash = Some(digest);
                            found.push(skill);
                        }
                    }
                }
                _ => {}
            }
        }
    }

    Ok(found)
}
/// Scans the repository at `repo.storage_path` for skills and syncs them into
/// the database for `project_uuid`.
///
/// A repository with a work tree is scanned on the filesystem; a bare
/// repository is scanned through the git tree of HEAD. Every discovered skill
/// is stamped with the HEAD commit SHA (when HEAD resolves to a direct
/// target) before being handed to `sync_discovered_skills`.
///
/// If the repository cannot be opened, or the tree scan fails, a warning is
/// logged and an all-zero [`ScanSyncResult`] is returned without touching the
/// database. A failed scan is deliberately not treated as "the repository
/// has no skills".
///
/// # Errors
///
/// Propagates errors from the filesystem scan and from the database sync.
///
/// NOTE(review): the scan performs blocking filesystem and libgit2 work inside
/// an `async fn`; consider moving it to a blocking task if repositories are large.
pub async fn scan_and_sync_skills(
    db: &db::database::AppDatabase,
    project_uuid: Uuid,
    repo: &RepoModel,
) -> Result<ScanSyncResult, AppError> {
    let nothing_synced = || ScanSyncResult {
        discovered: 0,
        created: 0,
        updated: 0,
        removed: 0,
    };

    // The repository handle lives only inside this block, so it is released
    // before the database work is awaited.
    let discovered = {
        let git_repo = match Repository::open(&repo.storage_path) {
            Ok(r) => r,
            Err(e) => {
                tracing::warn!("failed to open git repo {}: {:?}", repo.storage_path, e);
                return Ok(nothing_synced());
            }
        };

        let commit_sha = git_repo
            .head()
            .ok()
            .and_then(|h| h.target())
            .map(|oid| oid.to_string());

        let mut discovered = match git_repo.workdir() {
            Some(workdir) if !git_repo.is_bare() => scan_repo_for_skills(workdir, repo.id)?,
            // Bare repos (no workdir): read the git tree objects directly.
            _ => match scan_repo_tree_for_skills(&git_repo, repo.id) {
                Ok(skills) => skills,
                Err(e) => {
                    tracing::warn!("tree scan failed for repo {}: {:?}", repo.storage_path, e);
                    return Ok(nothing_synced());
                }
            },
        };

        for skill in &mut discovered {
            skill.commit_sha = commit_sha.clone();
        }
        discovered
    };

    sync_discovered_skills(db, project_uuid, repo.id, discovered).await
}
/// Sync discovered skills with deduplication by {repo_id}+{blob_hash}.
async fn sync_discovered_skills(
db: &db::database::AppDatabase,
project_uuid: Uuid,
repo_id: Uuid,
discovered: Vec<DiscoveredSkill>,
) -> Result<ScanSyncResult, AppError> {
if discovered.is_empty() {
return Ok(ScanSyncResult {
discovered: 0,
created: 0,
updated: 0,
removed: 0,
});
}
let now = Utc::now();
let mut created = 0i64;
let mut updated = 0i64;
// Deduplicate by {repo_id}+{blob_hash}, keep latest by commit_sha
let mut deduped: std::collections::HashMap<String, DiscoveredSkill> =
std::collections::HashMap::new();
for skill in discovered {
let key = format!(
"{}:{}",
repo_id,
skill.blob_hash.as_ref().unwrap_or(&skill.slug)
);
match deduped.get(&key) {
Some(existing) => {
// Keep the one with the later commit_sha
if skill.commit_sha.as_ref().unwrap_or(&String::new())
> existing.commit_sha.as_ref().unwrap_or(&String::new())
{
deduped.insert(key, skill);
}
}
None => {
deduped.insert(key, skill);
}
}
}
// Query existing skills for this repo
let existing: Vec<_> = SkillEntity::find()
.filter(C::ProjectUuid.eq(project_uuid))
.filter(C::Source.eq("repo"))
.filter(C::RepoId.eq(repo_id))
.all(db)
.await?;
let existing_by_hash: std::collections::HashMap<_, _> = existing
.into_iter()
.map(|s| {
let key = format!(
"{}:{}",
s.repo_id.unwrap_or_default(),
s.blob_hash.clone().unwrap_or_default()
);
(key, s)
})
.collect();
let mut seen_keys = std::collections::HashSet::new();
let discovered_count = deduped.len() as i64;
for (key, skill) in deduped {
seen_keys.insert(key.clone());
let json_meta = serde_json::to_value(&skill.metadata).unwrap_or_default();
if let Some(existing_skill) = existing_by_hash.get(&key) {
if existing_skill.content != skill.content
|| existing_skill.metadata != json_meta
|| existing_skill.commit_sha != skill.commit_sha
{
let mut active: SkillActiveModel = existing_skill.clone().into();
active.content = Set(skill.content);
active.metadata = Set(json_meta);
active.commit_sha = Set(skill.commit_sha);
active.blob_hash = Set(skill.blob_hash);
active.updated_at = Set(now);
active.update(db).await?;
updated += 1;
}
} else {
let active = SkillActiveModel {
id: Set(0),
project_uuid: Set(project_uuid),
slug: Set(skill.slug),
name: Set(skill.name),
description: Set(skill.description),
source: Set("repo".to_string()),
repo_id: Set(Some(repo_id)),
commit_sha: Set(skill.commit_sha),
blob_hash: Set(skill.blob_hash),
content: Set(skill.content),
metadata: Set(json_meta),
enabled: Set(true),
created_by: Set(None),
created_at: Set(now),
updated_at: Set(now),
};
active.insert(db).await?;
created += 1;
}
}
// Remove skills that no longer exist in the repo
let mut removed = 0i64;
for (key, old_skill) in existing_by_hash {
if !seen_keys.contains(&key) {
SkillEntity::delete_by_id(old_skill.id).exec(db).await?;
removed += 1;
}
}
Ok(ScanSyncResult {
discovered: discovered_count,
created,
updated,
removed,
})
}
/// Counters describing the outcome of one scan + sync run.
///
/// `Default` yields the all-zero result used when nothing was scanned.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct ScanSyncResult {
    /// Number of distinct skills found in the repository after deduplication.
    pub discovered: i64,
    /// Number of skill rows inserted.
    pub created: i64,
    /// Number of existing skill rows updated.
    pub updated: i64,
    /// Number of stale skill rows deleted.
    pub removed: i64,
}