fix(skill): deduplicate skills by repo_id+blob_hash

- Change deduplication key from slug to {repo_id}+{blob_hash}
- Keep latest version by commit_sha when duplicates found
- Use git2 to open repos and get correct workdir and commit_sha
- Fix case-insensitive SKILL.md detection in scanner
This commit is contained in:
ZhenYi 2026-04-28 21:27:38 +08:00
parent 18917b6de1
commit 2db7934596
2 changed files with 83 additions and 29 deletions

View File

@ -24,17 +24,16 @@ impl AppService {
project_uid: Uuid,
_caller_uid: Uuid,
) -> Result<scanner::ScanSyncResult, AppError> {
// Collect all repo IDs for this project
let repos: Vec<_> = RepoEntity::find()
.filter(RCol::Project.eq(project_uid))
.all(&self.db)
.await?;
let mut total_created = 0i64;
let mut total_updated = 0i64;
let mut total_removed = 0i64;
let mut total_discovered = 0i64;
let repos: Vec<_> = RepoEntity::find()
.filter(RCol::Project.eq(project_uid))
.all(&self.db)
.await?;
for repo in repos {
let result = scanner::scan_and_sync_skills(&self.db, project_uid, &repo).await?;
total_created += result.created;

View File

@ -4,6 +4,7 @@
use crate::error::AppError;
use chrono::Utc;
use git2::Repository;
use models::ActiveModelTrait;
use models::projects::project_skill::ActiveModel as SkillActiveModel;
use models::projects::project_skill::Column as C;
@ -108,7 +109,12 @@ pub fn scan_repo_for_skills(
let path = entry.path();
if path.is_dir() {
stack.push(path);
} else if path.file_name().and_then(|n| n.to_str()) == Some("SKILL.md") {
} else if path
.file_name()
.and_then(|n| n.to_str())
.map(|s| s.to_lowercase())
== Some("skill.md".to_string())
{
if let Some(dir_name) = path.parent()
.and_then(|p| p.file_name())
.and_then(|n| n.to_str())
@ -136,10 +142,40 @@ pub async fn scan_and_sync_skills(
project_uuid: Uuid,
repo: &RepoModel,
) -> Result<ScanSyncResult, AppError> {
// Resolve the repo path
let storage_path = Path::new(&repo.storage_path);
let discovered = scan_repo_for_skills(storage_path, repo.id)?;
// Open with git2 to get the actual workdir
let git_repo = match Repository::open(&repo.storage_path) {
Ok(r) => r,
Err(e) => {
tracing::warn!("failed to open git repo {}: {:?}", repo.storage_path, e);
return Ok(ScanSyncResult {
discovered: 0,
created: 0,
updated: 0,
removed: 0,
});
}
};
let workdir = git_repo.workdir().map(|p| p.to_path_buf()).unwrap_or_else(|| Path::new(&repo.storage_path).to_path_buf());
let commit_sha = git_repo.head().ok().and_then(|h| h.target()).map(|oid| oid.to_string());
let mut discovered = scan_repo_for_skills(&workdir, repo.id)?;
// Fill in commit_sha for discovered skills
for skill in &mut discovered {
skill.commit_sha = commit_sha.clone();
}
sync_discovered_skills(db, project_uuid, repo.id, discovered).await
}
/// Sync discovered skills with deduplication by {repo_id}+{blob_hash}.
async fn sync_discovered_skills(
db: &db::database::AppDatabase,
project_uuid: Uuid,
repo_id: Uuid,
discovered: Vec<DiscoveredSkill>,
) -> Result<ScanSyncResult, AppError> {
if discovered.is_empty() {
return Ok(ScanSyncResult {
discovered: 0,
@ -153,37 +189,56 @@ pub async fn scan_and_sync_skills(
let mut created = 0i64;
let mut updated = 0i64;
// Collect all repo-sourced skills in this repo for this project
// Deduplicate by {repo_id}+{blob_hash}, keep latest by commit_sha
let mut deduped: std::collections::HashMap<String, DiscoveredSkill> = std::collections::HashMap::new();
for skill in discovered {
let key = format!("{}:{}", repo_id, skill.blob_hash.as_ref().unwrap_or(&skill.slug));
match deduped.get(&key) {
Some(existing) => {
// Keep the one with the later commit_sha
if skill.commit_sha.as_ref().unwrap_or(&String::new()) > existing.commit_sha.as_ref().unwrap_or(&String::new()) {
deduped.insert(key, skill);
}
}
None => {
deduped.insert(key, skill);
}
}
}
// Query existing skills for this repo
let existing: Vec<_> = SkillEntity::find()
.filter(C::ProjectUuid.eq(project_uuid))
.filter(C::Source.eq("repo"))
.filter(C::RepoId.eq(repo.id))
.filter(C::RepoId.eq(repo_id))
.all(db)
.await?;
let existing_by_slug: std::collections::HashMap<_, _> = existing
let existing_by_hash: std::collections::HashMap<_, _> = existing
.into_iter()
.map(|s| (s.slug.clone(), s))
.map(|s| {
let key = format!("{}:{}", s.repo_id.unwrap_or_default(), s.blob_hash.clone().unwrap_or_default());
(key, s)
})
.collect();
let mut seen_slugs = std::collections::HashSet::new();
let discovered_count = discovered.len() as i64;
for skill in discovered {
seen_slugs.insert(skill.slug.clone());
let mut seen_keys = std::collections::HashSet::new();
let discovered_count = deduped.len() as i64;
for (key, skill) in deduped {
seen_keys.insert(key.clone());
let json_meta = serde_json::to_value(&skill.metadata).unwrap_or_default();
if let Some(existing_skill) = existing_by_slug.get(&skill.slug) {
if let Some(existing_skill) = existing_by_hash.get(&key) {
if existing_skill.content != skill.content
|| existing_skill.metadata != json_meta
|| existing_skill.blob_hash != skill.blob_hash
|| existing_skill.commit_sha != skill.commit_sha
{
let mut active: SkillActiveModel = existing_skill.clone().into();
active.content = Set(skill.content);
active.metadata = Set(json_meta);
active.commit_sha = Set(skill.commit_sha.clone());
active.blob_hash = Set(skill.blob_hash.clone());
active.commit_sha = Set(skill.commit_sha);
active.blob_hash = Set(skill.blob_hash);
active.updated_at = Set(now);
active.update(db).await?;
updated += 1;
@ -192,13 +247,13 @@ pub async fn scan_and_sync_skills(
let active = SkillActiveModel {
id: Set(0),
project_uuid: Set(project_uuid),
slug: Set(skill.slug.clone()),
slug: Set(skill.slug),
name: Set(skill.name),
description: Set(skill.description),
source: Set("repo".to_string()),
repo_id: Set(Some(repo.id)),
commit_sha: Set(skill.commit_sha.clone()),
blob_hash: Set(skill.blob_hash.clone()),
repo_id: Set(Some(repo_id)),
commit_sha: Set(skill.commit_sha),
blob_hash: Set(skill.blob_hash),
content: Set(skill.content),
metadata: Set(json_meta),
enabled: Set(true),
@ -213,8 +268,8 @@ pub async fn scan_and_sync_skills(
// Remove skills that no longer exist in the repo
let mut removed = 0i64;
for (slug, old_skill) in existing_by_slug {
if !seen_slugs.contains(&slug) {
for (key, old_skill) in existing_by_hash {
if !seen_keys.contains(&key) {
SkillEntity::delete_by_id(old_skill.id).exec(db).await?;
removed += 1;
}