- Change deduplication key from slug to {repo_id}+{blob_hash}
- Keep latest version by commit_sha when duplicates found
- Use git2 to open repos and get correct workdir and commit_sha
- Fix case-insensitive SKILL.md detection in scanner
294 lines
9.6 KiB
Rust
294 lines
9.6 KiB
Rust
//! Repository skill scanner.
|
|
//!
|
|
//! Scans repositories for SKILL.md files and upserts skill records.
|
|
|
|
use crate::error::AppError;
|
|
use chrono::Utc;
|
|
use git2::Repository;
|
|
use models::ActiveModelTrait;
|
|
use models::projects::project_skill::ActiveModel as SkillActiveModel;
|
|
use models::projects::project_skill::Column as C;
|
|
use models::projects::project_skill::Entity as SkillEntity;
|
|
use models::repos::repo::Model as RepoModel;
|
|
use sea_orm::{ColumnTrait, EntityTrait, QueryFilter, Set};
|
|
use sha1::Digest;
|
|
use std::path::Path;
|
|
use uuid::Uuid;
|
|
|
|
/// Skill discovery result from a single repository.
|
|
#[derive(Debug)]
|
|
pub struct DiscoveredSkill {
|
|
/// URL-safe slug derived from the directory name.
|
|
pub slug: String,
|
|
/// Human-readable name (from frontmatter or slug).
|
|
pub name: String,
|
|
/// Short description (from frontmatter).
|
|
pub description: Option<String>,
|
|
/// Raw markdown body after the frontmatter.
|
|
pub content: String,
|
|
/// Parsed frontmatter as JSON.
|
|
pub metadata: serde_json::Value,
|
|
/// Git commit SHA where this skill was found (git hook path only).
|
|
pub commit_sha: Option<String>,
|
|
/// Git blob SHA-1 of the SKILL.md file.
|
|
pub blob_hash: Option<String>,
|
|
}
|
|
|
|
/// Compute the git blob SHA-1 hash of `content`.
|
|
/// Format: "blob {len}\0{data}"
|
|
fn git_blob_hash(content: &[u8]) -> String {
|
|
let size = content.len();
|
|
let header = format!("blob {}\0", size);
|
|
let mut hasher = sha1::Sha1::new();
|
|
hasher.update(header.as_bytes());
|
|
hasher.update(content);
|
|
hex::encode(hasher.finalize())
|
|
}
|
|
|
|
/// Parse a SKILL.md file and extract metadata + content.
|
|
fn parse_skill_file(slug: &str, raw: &str) -> DiscoveredSkill {
|
|
let (frontmatter, content) = extract_frontmatter(raw);
|
|
|
|
let metadata: serde_json::Value = frontmatter
|
|
.map(|fm| serde_json::from_str(fm).unwrap_or_default())
|
|
.unwrap_or_default();
|
|
|
|
let name = metadata
|
|
.get("name")
|
|
.and_then(|v| v.as_str())
|
|
.map(String::from)
|
|
.unwrap_or_else(|| slug.replace('-', " ").replace('_', " "));
|
|
|
|
let description = metadata
|
|
.get("description")
|
|
.and_then(|v| v.as_str())
|
|
.map(String::from);
|
|
|
|
DiscoveredSkill {
|
|
slug: slug.to_string(),
|
|
name,
|
|
description,
|
|
content: content.trim().to_string(),
|
|
metadata,
|
|
commit_sha: None,
|
|
blob_hash: None,
|
|
}
|
|
}
|
|
|
|
/// Split a leading frontmatter block (`--- ... ---`) from the markdown body.
///
/// Leading whitespace in `raw` is ignored. Returns `(Some(frontmatter), body)`
/// when the text starts with `---` and a closing `---` exists; the
/// frontmatter is the text between the two delimiters and the body has its
/// leading whitespace removed. Returns `(None, text)` when there is no
/// opening delimiter or no closing one.
///
/// The closing delimiter is preferably a `---` at the start of a line, so a
/// `---` that merely appears inside a frontmatter value does not cut the
/// block short. If no line starts with `---`, the first `---` anywhere is
/// used, which keeps single-line `--- ... ---` blocks working.
fn extract_frontmatter(raw: &str) -> (Option<&str>, &str) {
    const FENCE: &str = "---";

    let trimmed = raw.trim_start();
    let after_open = match trimmed.strip_prefix(FENCE) {
        Some(rest) => rest,
        None => return (None, trimmed),
    };

    // `FENCE` is ASCII, so every match offset is a char boundary and the byte
    // before a match at `idx > 0` can be inspected directly.
    let close = after_open
        .match_indices(FENCE)
        .map(|(idx, _)| idx)
        .find(|&idx| idx == 0 || after_open.as_bytes()[idx - 1] == b'\n')
        .or_else(|| after_open.find(FENCE));

    match close {
        Some(idx) => {
            let body = after_open[idx + FENCE.len()..].trim_start();
            (Some(&after_open[..idx]), body)
        }
        None => (None, trimmed),
    }
}
|
|
|
|
/// Recursively scan `repo_path` for `SKILL.md` files.
|
|
/// The skill slug is `{short_repo_id}/{parent_dir_name}` to ensure uniqueness across repos.
|
|
pub fn scan_repo_for_skills(
|
|
repo_path: &Path,
|
|
repo_id: Uuid,
|
|
) -> Result<Vec<DiscoveredSkill>, AppError> {
|
|
let repo_id_prefix = &repo_id.to_string()[..8];
|
|
let mut discovered = Vec::new();
|
|
let mut stack = vec![repo_path.to_path_buf()];
|
|
|
|
while let Some(dir) = stack.pop() {
|
|
let entries = match std::fs::read_dir(&dir) {
|
|
Ok(e) => e,
|
|
Err(_) => continue,
|
|
};
|
|
for entry in entries.flatten() {
|
|
let path = entry.path();
|
|
if path.is_dir() {
|
|
stack.push(path);
|
|
} else if path
|
|
.file_name()
|
|
.and_then(|n| n.to_str())
|
|
.map(|s| s.to_lowercase())
|
|
== Some("skill.md".to_string())
|
|
{
|
|
if let Some(dir_name) = path.parent()
|
|
.and_then(|p| p.file_name())
|
|
.and_then(|n| n.to_str())
|
|
.filter(|s| !s.starts_with('.'))
|
|
{
|
|
let slug = format!("{}/{}", repo_id_prefix, dir_name);
|
|
if let Ok(raw) = std::fs::read(&path) {
|
|
let blob_hash = git_blob_hash(&raw);
|
|
let mut skill = parse_skill_file(&slug, &String::from_utf8_lossy(&raw));
|
|
skill.blob_hash = Some(blob_hash);
|
|
discovered.push(skill);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(discovered)
|
|
}
|
|
|
|
/// Scan a git2::Repository for skills and upsert them into the database.
|
|
/// Called from the git hook sync path.
|
|
pub async fn scan_and_sync_skills(
|
|
db: &db::database::AppDatabase,
|
|
project_uuid: Uuid,
|
|
repo: &RepoModel,
|
|
) -> Result<ScanSyncResult, AppError> {
|
|
// Open with git2 to get the actual workdir
|
|
let git_repo = match Repository::open(&repo.storage_path) {
|
|
Ok(r) => r,
|
|
Err(e) => {
|
|
tracing::warn!("failed to open git repo {}: {:?}", repo.storage_path, e);
|
|
return Ok(ScanSyncResult {
|
|
discovered: 0,
|
|
created: 0,
|
|
updated: 0,
|
|
removed: 0,
|
|
});
|
|
}
|
|
};
|
|
|
|
let workdir = git_repo.workdir().map(|p| p.to_path_buf()).unwrap_or_else(|| Path::new(&repo.storage_path).to_path_buf());
|
|
let commit_sha = git_repo.head().ok().and_then(|h| h.target()).map(|oid| oid.to_string());
|
|
|
|
let mut discovered = scan_repo_for_skills(&workdir, repo.id)?;
|
|
|
|
// Fill in commit_sha for discovered skills
|
|
for skill in &mut discovered {
|
|
skill.commit_sha = commit_sha.clone();
|
|
}
|
|
|
|
sync_discovered_skills(db, project_uuid, repo.id, discovered).await
|
|
}
|
|
|
|
/// Sync discovered skills with deduplication by {repo_id}+{blob_hash}.
|
|
async fn sync_discovered_skills(
|
|
db: &db::database::AppDatabase,
|
|
project_uuid: Uuid,
|
|
repo_id: Uuid,
|
|
discovered: Vec<DiscoveredSkill>,
|
|
) -> Result<ScanSyncResult, AppError> {
|
|
if discovered.is_empty() {
|
|
return Ok(ScanSyncResult {
|
|
discovered: 0,
|
|
created: 0,
|
|
updated: 0,
|
|
removed: 0,
|
|
});
|
|
}
|
|
|
|
let now = Utc::now();
|
|
let mut created = 0i64;
|
|
let mut updated = 0i64;
|
|
|
|
// Deduplicate by {repo_id}+{blob_hash}, keep latest by commit_sha
|
|
let mut deduped: std::collections::HashMap<String, DiscoveredSkill> = std::collections::HashMap::new();
|
|
for skill in discovered {
|
|
let key = format!("{}:{}", repo_id, skill.blob_hash.as_ref().unwrap_or(&skill.slug));
|
|
match deduped.get(&key) {
|
|
Some(existing) => {
|
|
// Keep the one with the later commit_sha
|
|
if skill.commit_sha.as_ref().unwrap_or(&String::new()) > existing.commit_sha.as_ref().unwrap_or(&String::new()) {
|
|
deduped.insert(key, skill);
|
|
}
|
|
}
|
|
None => {
|
|
deduped.insert(key, skill);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Query existing skills for this repo
|
|
let existing: Vec<_> = SkillEntity::find()
|
|
.filter(C::ProjectUuid.eq(project_uuid))
|
|
.filter(C::Source.eq("repo"))
|
|
.filter(C::RepoId.eq(repo_id))
|
|
.all(db)
|
|
.await?;
|
|
|
|
let existing_by_hash: std::collections::HashMap<_, _> = existing
|
|
.into_iter()
|
|
.map(|s| {
|
|
let key = format!("{}:{}", s.repo_id.unwrap_or_default(), s.blob_hash.clone().unwrap_or_default());
|
|
(key, s)
|
|
})
|
|
.collect();
|
|
|
|
let mut seen_keys = std::collections::HashSet::new();
|
|
|
|
let discovered_count = deduped.len() as i64;
|
|
for (key, skill) in deduped {
|
|
seen_keys.insert(key.clone());
|
|
let json_meta = serde_json::to_value(&skill.metadata).unwrap_or_default();
|
|
|
|
if let Some(existing_skill) = existing_by_hash.get(&key) {
|
|
if existing_skill.content != skill.content
|
|
|| existing_skill.metadata != json_meta
|
|
|| existing_skill.commit_sha != skill.commit_sha
|
|
{
|
|
let mut active: SkillActiveModel = existing_skill.clone().into();
|
|
active.content = Set(skill.content);
|
|
active.metadata = Set(json_meta);
|
|
active.commit_sha = Set(skill.commit_sha);
|
|
active.blob_hash = Set(skill.blob_hash);
|
|
active.updated_at = Set(now);
|
|
active.update(db).await?;
|
|
updated += 1;
|
|
}
|
|
} else {
|
|
let active = SkillActiveModel {
|
|
id: Set(0),
|
|
project_uuid: Set(project_uuid),
|
|
slug: Set(skill.slug),
|
|
name: Set(skill.name),
|
|
description: Set(skill.description),
|
|
source: Set("repo".to_string()),
|
|
repo_id: Set(Some(repo_id)),
|
|
commit_sha: Set(skill.commit_sha),
|
|
blob_hash: Set(skill.blob_hash),
|
|
content: Set(skill.content),
|
|
metadata: Set(json_meta),
|
|
enabled: Set(true),
|
|
created_by: Set(None),
|
|
created_at: Set(now),
|
|
updated_at: Set(now),
|
|
};
|
|
active.insert(db).await?;
|
|
created += 1;
|
|
}
|
|
}
|
|
|
|
// Remove skills that no longer exist in the repo
|
|
let mut removed = 0i64;
|
|
for (key, old_skill) in existing_by_hash {
|
|
if !seen_keys.contains(&key) {
|
|
SkillEntity::delete_by_id(old_skill.id).exec(db).await?;
|
|
removed += 1;
|
|
}
|
|
}
|
|
|
|
Ok(ScanSyncResult {
|
|
discovered: discovered_count,
|
|
created,
|
|
updated,
|
|
removed,
|
|
})
|
|
}
|
|
|
|
/// Counters describing the outcome of one scan + sync run.
///
/// `Default` yields the all-zero result.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ScanSyncResult {
    /// Number of distinct skills found by the scan, after deduplication.
    pub discovered: i64,
    /// Number of skill records inserted.
    pub created: i64,
    /// Number of existing skill records updated.
    pub updated: i64,
    /// Number of skill records deleted because they were no longer found.
    pub removed: i64,
}
|