- Remove activation threshold logic from PassiveSkillAwareness - Add SkillActivation enum with Priority/Keyword/Vector/Auto variants - Add deduplication via SkillContext.dedupe_key() using rank ordering - Simplify ActiveSkillAwareness with cleaner regex-based detection
229 lines
7.4 KiB
Rust
229 lines
7.4 KiB
Rust
//! Auto skill awareness: ambient relevance matching.
|
|
|
|
use super::{SkillActivation, SkillContext, SkillEntry};
|
|
use std::collections::HashSet;
|
|
|
|
/// Configuration for automatic ("ambient") skill detection.
///
/// A skill is selected when the keyword overlap between the conversation and
/// the skill's text reaches `min_score`; at most `max_skills` skills are
/// returned per detection pass.
#[derive(Debug, Clone)]
pub struct AutoSkillAwareness {
    /// Minimum overlap score a skill must reach to be considered relevant.
    min_score: f32,
    /// Upper bound on the number of auto-selected skills.
    max_skills: usize,
}
|
|
|
|
impl Default for AutoSkillAwareness {
|
|
fn default() -> Self {
|
|
Self {
|
|
min_score: 0.20,
|
|
max_skills: 3,
|
|
}
|
|
}
|
|
}
|
|
|
|
impl AutoSkillAwareness {
|
|
pub fn new(min_score: f32, max_skills: usize) -> Self {
|
|
Self {
|
|
min_score,
|
|
max_skills,
|
|
}
|
|
}
|
|
|
|
pub async fn detect(
|
|
&self,
|
|
current_input: &str,
|
|
history: &[String],
|
|
skills: &[SkillEntry],
|
|
) -> Vec<SkillContext> {
|
|
if skills.is_empty() {
|
|
return Vec::new();
|
|
}
|
|
|
|
let history_text = history
|
|
.iter()
|
|
.rev()
|
|
.take(5)
|
|
.map(String::as_str)
|
|
.collect::<Vec<_>>()
|
|
.join(" ");
|
|
let corpus = format!("{} {}", current_input, history_text).to_lowercase();
|
|
let corpus_keywords = Self::extract_keywords(&corpus);
|
|
if corpus_keywords.is_empty() {
|
|
return Vec::new();
|
|
}
|
|
|
|
let mut scored = skills
|
|
.iter()
|
|
.filter_map(|skill| {
|
|
let score = Self::score_skill(&corpus_keywords, skill);
|
|
(score >= self.min_score).then_some((score, skill))
|
|
})
|
|
.collect::<Vec<_>>();
|
|
|
|
scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
|
|
|
|
scored
|
|
.into_iter()
|
|
.take(self.max_skills)
|
|
.map(|(score, skill)| {
|
|
let excerpt = Self::best_excerpt(&corpus, skill);
|
|
SkillContext::new(skill, SkillActivation::Auto, None, excerpt, Some(score))
|
|
})
|
|
.collect()
|
|
}
|
|
|
|
fn extract_keywords(text: &str) -> HashSet<String> {
|
|
const STOPWORDS: &[&str] = &[
|
|
"the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has",
|
|
"had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "can",
|
|
"to", "of", "in", "for", "on", "with", "at", "by", "from", "as", "or", "and", "but",
|
|
"if", "not", "no", "so", "this", "that", "these", "those", "it", "its", "i", "you",
|
|
"we", "they", "what", "which", "who", "when", "where", "why", "how", "all", "each",
|
|
"every", "more", "most", "some", "such", "only", "same", "than", "too", "very", "just",
|
|
"also", "now", "here", "there", "then",
|
|
];
|
|
|
|
let mut terms = HashSet::new();
|
|
let mut ascii = String::new();
|
|
let mut cjk_run = String::new();
|
|
|
|
for ch in text.chars() {
|
|
if ch.is_ascii_alphanumeric() || ch == '_' || ch == '-' {
|
|
if !cjk_run.is_empty() {
|
|
Self::push_cjk_terms(&mut terms, &cjk_run);
|
|
cjk_run.clear();
|
|
}
|
|
ascii.push(ch);
|
|
} else if ('\u{4e00}'..='\u{9fff}').contains(&ch) {
|
|
if !ascii.is_empty() {
|
|
Self::push_ascii_term(&mut terms, &ascii, STOPWORDS);
|
|
ascii.clear();
|
|
}
|
|
cjk_run.push(ch);
|
|
} else {
|
|
if !ascii.is_empty() {
|
|
Self::push_ascii_term(&mut terms, &ascii, STOPWORDS);
|
|
ascii.clear();
|
|
}
|
|
if !cjk_run.is_empty() {
|
|
Self::push_cjk_terms(&mut terms, &cjk_run);
|
|
cjk_run.clear();
|
|
}
|
|
}
|
|
}
|
|
|
|
if !ascii.is_empty() {
|
|
Self::push_ascii_term(&mut terms, &ascii, STOPWORDS);
|
|
}
|
|
if !cjk_run.is_empty() {
|
|
Self::push_cjk_terms(&mut terms, &cjk_run);
|
|
}
|
|
|
|
terms
|
|
}
|
|
|
|
fn push_ascii_term(terms: &mut HashSet<String>, raw: &str, stopwords: &[&str]) {
|
|
let term = raw
|
|
.trim_matches(|c: char| !c.is_ascii_alphanumeric() && c != '_' && c != '-')
|
|
.to_lowercase();
|
|
if term.len() >= 3 && !stopwords.contains(&term.as_str()) {
|
|
terms.insert(term);
|
|
}
|
|
}
|
|
|
|
fn push_cjk_terms(terms: &mut HashSet<String>, raw: &str) {
|
|
let chars = raw.chars().collect::<Vec<_>>();
|
|
if chars.len() < 2 {
|
|
return;
|
|
}
|
|
for window in chars.windows(2) {
|
|
terms.insert(window.iter().collect());
|
|
}
|
|
if chars.len() >= 4 {
|
|
terms.insert(chars.iter().collect());
|
|
}
|
|
}
|
|
|
|
fn score_skill(corpus_keywords: &HashSet<String>, skill: &SkillEntry) -> f32 {
|
|
let skill_text = format!(
|
|
"{} {} {}",
|
|
skill.name,
|
|
skill.description.as_deref().unwrap_or(""),
|
|
skill.content.chars().take(800).collect::<String>()
|
|
)
|
|
.to_lowercase();
|
|
let skill_keywords = Self::extract_keywords(&skill_text);
|
|
|
|
if skill_keywords.is_empty() {
|
|
return 0.0;
|
|
}
|
|
|
|
let overlap = corpus_keywords
|
|
.iter()
|
|
.filter(|kw| {
|
|
skill_keywords
|
|
.iter()
|
|
.any(|sk| sk == *kw || (kw.len() >= 4 && sk.contains(kw.as_str())))
|
|
})
|
|
.count();
|
|
let denominator = corpus_keywords.len().min(skill_keywords.len()).max(1);
|
|
overlap as f32 / denominator as f32
|
|
}
|
|
|
|
fn best_excerpt(corpus: &str, skill: &SkillEntry) -> String {
|
|
let corpus_kws = Self::extract_keywords(corpus);
|
|
let best_para = skill
|
|
.content
|
|
.split('\n')
|
|
.filter(|para| !para.trim().is_empty())
|
|
.map(|para| {
|
|
let para_kws = Self::extract_keywords(¶.to_lowercase());
|
|
let overlap = corpus_kws
|
|
.iter()
|
|
.filter(|kw| {
|
|
para_kws
|
|
.iter()
|
|
.any(|pk| pk == *kw || pk.contains(kw.as_str()))
|
|
})
|
|
.count();
|
|
(overlap, para)
|
|
})
|
|
.filter(|(score, _)| *score > 0)
|
|
.max_by_key(|(score, _)| *score);
|
|
|
|
if let Some((_, para)) = best_para {
|
|
format!("# {} (auto-matched)\n\n{}", skill.name, para.trim())
|
|
} else {
|
|
let excerpt = skill.content.chars().take(300).collect::<String>();
|
|
format!("# {} (auto-matched)\n\n{}...", skill.name, excerpt.trim())
|
|
}
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `SkillEntry` fixture from string slices.
    fn skill(slug: &str, name: &str, description: &str, content: &str) -> SkillEntry {
        SkillEntry {
            slug: slug.to_string(),
            name: name.to_string(),
            description: Some(description.to_string()),
            content: content.to_string(),
        }
    }

    /// Chinese text has no word separators, so matching relies on the
    /// character-pair terms produced for CJK runs.
    #[tokio::test]
    async fn auto_detects_chinese_without_spaces() {
        let skills = vec![skill(
            "code-review",
            "代码审查",
            "检查代码安全和性能问题",
            "审查变更,发现 bug、安全漏洞和性能风险。",
        )];
        let found = AutoSkillAwareness::new(0.10, 3)
            .detect("帮我检查这次代码安全问题", &[], &skills)
            .await;
        // Fail with a readable message instead of an index-out-of-bounds panic
        // when nothing was detected.
        let first = found
            .first()
            .expect("expected at least one auto-matched skill");
        assert_eq!(first.slug, "code-review");
    }

    /// With no skills registered there is nothing to match against.
    #[tokio::test]
    async fn returns_nothing_without_skills() {
        let found = AutoSkillAwareness::default()
            .detect("帮我检查这次代码安全问题", &[], &[])
            .await;
        assert!(found.is_empty());
    }
}
|