gitdataai/libs/agent/perception/auto.rs
ZhenYi 5c1b14c26a refactor(perception): simplify active/auto detection and deduplication
- Remove activation threshold logic from PassiveSkillAwareness
- Add SkillActivation enum with Priority/Keyword/Vector/Auto variants
- Add deduplication via SkillContext.dedupe_key() using rank ordering
- Simplify ActiveSkillAwareness with cleaner regex-based detection
2026-05-17 17:31:50 +08:00

229 lines
7.4 KiB
Rust

//! Auto skill awareness: ambient relevance matching.
use super::{SkillActivation, SkillContext, SkillEntry};
use std::collections::HashSet;
/// Keyword-overlap detector that auto-selects skills for a conversation turn.
///
/// Both knobs are consumed by `detect`: `min_score` filters candidates and
/// `max_skills` caps how many survive.
#[derive(Debug, Clone)]
pub struct AutoSkillAwareness {
/// Minimum overlap score to consider a skill relevant; a skill is kept
/// when its score is greater than or equal to this value.
min_score: f32,
/// Maximum number of auto-selected skills returned from one detection pass.
max_skills: usize,
}
impl Default for AutoSkillAwareness {
    /// Returns a detector with a score threshold of `0.20` and a cap of
    /// three auto-selected skills.
    fn default() -> Self {
        const DEFAULT_MIN_SCORE: f32 = 0.20;
        const DEFAULT_MAX_SKILLS: usize = 3;

        Self {
            min_score: DEFAULT_MIN_SCORE,
            max_skills: DEFAULT_MAX_SKILLS,
        }
    }
}
impl AutoSkillAwareness {
/// Creates a detector with explicit tuning values.
///
/// * `min_score` - lowest score at which a skill is still selected.
/// * `max_skills` - upper bound on the number of skills selected per call.
pub fn new(min_score: f32, max_skills: usize) -> Self {
    Self { min_score, max_skills }
}
/// Selects the skills whose text overlaps most with the conversation.
///
/// The matching corpus is `current_input` followed by up to five entries
/// taken from the end of `history`, all lowercased. Each skill is scored
/// against the corpus keywords; skills scoring at least `min_score` are
/// ordered by descending score (the sort is stable, so equal scores keep
/// their order in `skills`) and at most `max_skills` are returned as
/// `SkillActivation::Auto` contexts carrying an excerpt and the score.
///
/// Returns an empty vector when `skills` is empty or when the corpus
/// produces no keywords.
pub async fn detect(
    &self,
    current_input: &str,
    history: &[String],
    skills: &[SkillEntry],
) -> Vec<SkillContext> {
    if skills.is_empty() {
        return Vec::new();
    }

    // Current input, one space, then the tail of the history (last entry
    // first) separated by single spaces.
    let mut corpus = String::from(current_input);
    corpus.push(' ');
    for (idx, turn) in history.iter().rev().take(5).enumerate() {
        if idx > 0 {
            corpus.push(' ');
        }
        corpus.push_str(turn);
    }
    let corpus = corpus.to_lowercase();

    let wanted = Self::extract_keywords(&corpus);
    if wanted.is_empty() {
        return Vec::new();
    }

    let mut ranked: Vec<(f32, &SkillEntry)> = Vec::new();
    for entry in skills {
        let relevance = Self::score_skill(&wanted, entry);
        if relevance >= self.min_score {
            ranked.push((relevance, entry));
        }
    }

    // Highest score first; scores that cannot be compared count as equal.
    ranked.sort_by(|lhs, rhs| {
        rhs.0
            .partial_cmp(&lhs.0)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    ranked.truncate(self.max_skills);

    ranked
        .into_iter()
        .map(|(relevance, entry)| {
            let excerpt = Self::best_excerpt(&corpus, entry);
            SkillContext::new(entry, SkillActivation::Auto, None, excerpt, Some(relevance))
        })
        .collect()
}
/// Tokenizes `text` into a set of keyword terms.
///
/// Two kinds of runs are recognized:
/// * ASCII runs made of alphanumerics, `_` and `-`, handed to
///   `push_ascii_term` together with the stop-word list;
/// * runs of characters in `U+4E00..=U+9FFF`, handed to `push_cjk_terms`.
///
/// Every other character only acts as a separator.
fn extract_keywords(text: &str) -> HashSet<String> {
    const STOPWORDS: &[&str] = &[
        "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "have", "has",
        "had", "do", "does", "did", "will", "would", "could", "should", "may", "might", "can",
        "to", "of", "in", "for", "on", "with", "at", "by", "from", "as", "or", "and", "but",
        "if", "not", "no", "so", "this", "that", "these", "those", "it", "its", "i", "you",
        "we", "they", "what", "which", "who", "when", "where", "why", "how", "all", "each",
        "every", "more", "most", "some", "such", "only", "same", "than", "too", "very", "just",
        "also", "now", "here", "there", "then",
    ];

    let mut keywords = HashSet::new();
    let mut word = String::new();
    let mut han = String::new();

    // The trailing sentinel space is a separator, so it closes whichever run
    // is still open when the real text ends.
    for ch in text.chars().chain(std::iter::once(' ')) {
        let in_word = ch.is_ascii_alphanumeric() || matches!(ch, '_' | '-');
        let in_han = matches!(ch, '\u{4e00}'..='\u{9fff}');

        // A character that does not continue a run terminates it.
        if !in_word && !word.is_empty() {
            Self::push_ascii_term(&mut keywords, &word, STOPWORDS);
            word.clear();
        }
        if !in_han && !han.is_empty() {
            Self::push_cjk_terms(&mut keywords, &han);
            han.clear();
        }

        if in_word {
            word.push(ch);
        } else if in_han {
            han.push(ch);
        }
    }

    keywords
}
/// Normalizes one ASCII token and records it in `terms` if it is a usable keyword.
///
/// The token is stripped of leading/trailing characters other than ASCII
/// alphanumerics, `_` and `-`, then lowercased. It is inserted only when it
/// * is at least three bytes long,
/// * contains at least one ASCII alphanumeric character, and
/// * is not listed in `stopwords`.
///
/// The alphanumeric requirement rejects punctuation-only runs such as `---`
/// (a Markdown rule or front-matter fence) or `___`, which would otherwise
/// pass the length check and be counted as keywords.
fn push_ascii_term(terms: &mut HashSet<String>, raw: &str, stopwords: &[&str]) {
    let term = raw
        .trim_matches(|c: char| !c.is_ascii_alphanumeric() && c != '_' && c != '-')
        .to_lowercase();

    let has_word_char = term.chars().any(|c| c.is_ascii_alphanumeric());
    if term.len() >= 3 && has_word_char && !stopwords.contains(&term.as_str()) {
        terms.insert(term);
    }
}
/// Adds search terms derived from one unbroken run of characters.
///
/// Every pair of adjacent characters becomes a term, and a run of four or
/// more characters is additionally stored as a whole. Runs shorter than two
/// characters contribute nothing.
fn push_cjk_terms(terms: &mut HashSet<String>, raw: &str) {
    let glyphs: Vec<char> = raw.chars().collect();

    // `windows(2)` is empty for runs of fewer than two characters.
    terms.extend(
        glyphs
            .windows(2)
            .map(|pair| pair.iter().collect::<String>()),
    );

    if glyphs.len() >= 4 {
        terms.insert(raw.to_owned());
    }
}
/// Scores how strongly `skill` overlaps with the corpus keywords.
///
/// The skill text is its name, its description (empty when absent) and the
/// first 800 characters of its content, lowercased and tokenized with
/// `extract_keywords`. A corpus keyword counts as a hit when it equals a
/// skill keyword, or when it is at least four bytes long and occurs inside
/// one. The score is `hits / min(corpus keywords, skill keywords)`.
///
/// Returns `0.0` when the skill yields no keywords. Because several corpus
/// keywords may hit the same skill keyword, the score can exceed `1.0`.
fn score_skill(corpus_keywords: &HashSet<String>, skill: &SkillEntry) -> f32 {
    let head: String = skill.content.chars().take(800).collect();
    let description = skill.description.as_deref().unwrap_or("");
    let haystack = format!("{} {} {}", skill.name, description, head).to_lowercase();

    let skill_terms = Self::extract_keywords(&haystack);
    if skill_terms.is_empty() {
        return 0.0;
    }

    let mut hits = 0usize;
    for kw in corpus_keywords {
        // Exact membership first; the substring scan is only for longer keywords.
        let matched = skill_terms.contains(kw)
            || (kw.len() >= 4 && skill_terms.iter().any(|term| term.contains(kw.as_str())));
        if matched {
            hits += 1;
        }
    }

    let denominator = corpus_keywords.len().min(skill_terms.len()).max(1);
    hits as f32 / denominator as f32
}
/// Builds the excerpt shown for an auto-matched skill.
///
/// Each non-blank line of `skill.content` is tokenized and compared with
/// the keywords of `corpus`. A corpus keyword matches a line keyword when
/// the two are equal, or when the corpus keyword is at least four bytes
/// long and occurs inside the line keyword (the same rule `score_skill`
/// applies, so short keywords such as `git` do not match `digit`). The line
/// with the most matches is returned under a `# <name> (auto-matched)`
/// heading; on a tie `max_by_key` yields the last of the tied lines.
///
/// When no line matches, the first 300 characters of the content are used
/// instead, followed by `...` only if the content was actually cut short.
fn best_excerpt(corpus: &str, skill: &SkillEntry) -> String {
    let corpus_kws = Self::extract_keywords(corpus);

    let best_line = skill
        .content
        .split('\n')
        .filter(|line| !line.trim().is_empty())
        .map(|line| {
            let line_kws = Self::extract_keywords(&line.to_lowercase());
            let overlap = corpus_kws
                .iter()
                .filter(|kw| {
                    line_kws.contains(kw.as_str())
                        || (kw.len() >= 4
                            && line_kws.iter().any(|lk| lk.contains(kw.as_str())))
                })
                .count();
            (overlap, line)
        })
        .filter(|(overlap, _)| *overlap > 0)
        .max_by_key(|(overlap, _)| *overlap);

    match best_line {
        Some((_, line)) => format!("# {} (auto-matched)\n\n{}", skill.name, line.trim()),
        None => {
            let mut rest = skill.content.chars();
            let head: String = rest.by_ref().take(300).collect();
            // Anything left in `rest` means the content was truncated.
            let ellipsis = if rest.next().is_some() { "..." } else { "" };
            format!(
                "# {} (auto-matched)\n\n{}{}",
                skill.name,
                head.trim(),
                ellipsis
            )
        }
    }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a `SkillEntry` fixture from string slices.
    fn make_skill(slug: &str, name: &str, description: &str, content: &str) -> SkillEntry {
        SkillEntry {
            content: content.to_owned(),
            description: Some(description.to_owned()),
            name: name.to_owned(),
            slug: slug.to_owned(),
        }
    }

    /// Chinese input carries no whitespace between words, so detection has
    /// to work on the unsegmented character run.
    #[tokio::test]
    async fn auto_detects_chinese_without_spaces() {
        let catalog = [make_skill(
            "code-review",
            "代码审查",
            "检查代码安全和性能问题",
            "审查变更,发现 bug、安全漏洞和性能风险。",
        )];
        let detector = AutoSkillAwareness::new(0.10, 3);

        let found = detector
            .detect("帮我检查这次代码安全问题", &[], &catalog)
            .await;

        assert_eq!(found[0].slug, "code-review");
    }
}