// gitdataai/libs/agent/perception/auto.rs

//! Auto skill awareness — background scanning for skill relevance.
//!
//! Periodically (or on-demand) scans the conversation context to identify
//! which enabled skills might be relevant, based on keyword overlap between
//! the skill's metadata (name, description, content snippets) and the
//! conversation text.
//!
//! This is the "ambient awareness" mode — the agent is always aware of
//! which skills might apply without the user explicitly invoking them.

use super::{SkillContext, SkillEntry};

/// Configuration for ambient ("auto") skill detection.
///
/// Holds the two knobs read by `detect`: a relevance threshold and a cap on
/// how many skills are returned. Both fields are private; construct a value
/// with `new` or `Default::default`.
#[derive(Debug, Clone)]
pub struct AutoSkillAwareness {
    /// Relevance threshold: `detect` drops every skill whose keyword-overlap
    /// score is below this value. Nominally expressed on a 0.0–1.0 scale.
    min_score: f32,
    /// Upper bound on the number of skills `detect` returns, applied after
    /// the surviving skills have been sorted by descending score.
    max_skills: usize,
}

impl Default for AutoSkillAwareness {
    /// Builds the stock configuration: a relevance threshold of `0.15` and
    /// at most `3` auto-injected skills.
    fn default() -> Self {
        let min_score = 0.15;
        let max_skills = 3;
        Self {
            min_score,
            max_skills,
        }
    }
}

impl AutoSkillAwareness {
    /// Creates a detector with an explicit relevance threshold and result cap.
    ///
    /// * `min_score` — skills scoring below this value are never returned.
    /// * `max_skills` — maximum number of skills returned by [`Self::detect`].
    pub fn new(min_score: f32, max_skills: usize) -> Self {
        Self { min_score, max_skills }
    }

    /// Detect relevant skills by scoring keyword overlap between skill metadata
    /// and the conversation text (current input + recent history).
    ///
    /// The corpus is `current_input` plus the last five entries of `history`
    /// (taken from the end of the slice, so `history` is assumed to be ordered
    /// oldest-first — confirm against callers).
    ///
    /// Returns up to `max_skills` skills whose score is at least `min_score`,
    /// sorted by descending score. Skills with equal scores keep their
    /// relative order from `skills` because the sort is stable. Returns an
    /// empty vector when there are no skills, when `max_skills` is zero, or
    /// when the corpus yields no keywords.
    pub async fn detect(
        &self,
        current_input: &str,
        history: &[String],
        skills: &[SkillEntry],
    ) -> Vec<SkillContext> {
        /// Number of most recent history entries folded into the corpus.
        const HISTORY_WINDOW: usize = 5;

        // Nothing can be returned in either case, so skip all scoring work.
        if skills.is_empty() || self.max_skills == 0 {
            return Vec::new();
        }

        // Word order is irrelevant for bag-of-words matching, so the history
        // entries are simply appended newest-first.
        let corpus = std::iter::once(current_input)
            .chain(history.iter().rev().take(HISTORY_WINDOW).map(String::as_str))
            .collect::<Vec<_>>()
            .join(" ");

        let corpus_keywords = Self::extract_keywords(&corpus);
        if corpus_keywords.is_empty() {
            return Vec::new();
        }

        // Score each skill and keep only those that clear the threshold.
        let mut scored: Vec<(f32, &SkillEntry)> = skills
            .iter()
            .map(|skill| (Self::score_skill(&corpus_keywords, skill), skill))
            .filter(|(score, _)| *score >= self.min_score)
            .collect();

        // Sort descending by score.
        scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));

        scored
            .into_iter()
            .take(self.max_skills)
            .map(|(_, skill)| SkillContext {
                label: format!("Auto skill: {}", skill.name),
                content: Self::best_excerpt(&corpus, skill),
            })
            .collect()
    }

    /// Extract meaningful keywords from text.
    ///
    /// The text is split on whitespace; each token has leading and trailing
    /// non-alphanumeric characters stripped and is lowercased. Tokens shorter
    /// than three bytes and stopwords are dropped. Duplicates are preserved in
    /// input order; callers that need distinct keywords dedupe themselves.
    ///
    /// The length check is in UTF-8 bytes, not characters, so a single CJK
    /// character (three bytes) passes it — which is why single-character
    /// Chinese stopwords are listed below.
    fn extract_keywords(text: &str) -> Vec<String> {
        /// Minimum token length, in UTF-8 bytes, for a token to be a keyword.
        const MIN_KEYWORD_BYTES: usize = 3;

        // Common English + Chinese stopwords to filter out.
        const STOPWORDS: &[&str] = &[
            "the", "a", "an", "is", "are", "was", "were", "be", "been", "being",
            "have", "has", "had", "do", "does", "did", "will", "would", "could",
            "should", "may", "might", "can", "to", "of", "in", "for", "on", "with",
            "at", "by", "from", "as", "or", "and", "but", "if", "not", "no", "so",
            "this", "that", "these", "those", "it", "its", "i", "you", "he", "she",
            "we", "they", "what", "which", "who", "when", "where", "why", "how",
            "all", "each", "every", "both", "few", "more", "most", "other", "some",
            "such", "only", "own", "same", "than", "too", "very", "just", "also",
            "now", "here", "there", "then", "once", "again", "always", "ever",
            "的", "了", "是", "在", "我", "你", "他", "她", "它", "们", "这", "那",
            "个", "一", "上", "下", "来", "去", "说", "看", "想", "要", "会", "能",
            "和", "与", "或", "不", "就", "也", "都", "还", "从", "到", "把", "被",
            "让", "给", "用", "做", "为", "以", "及", "等", "很", "太", "比较",
        ];

        text.split_whitespace()
            // Normalize first so the emitted keyword is the same string that
            // was checked: punctuation-free and lowercase.
            .map(|word| {
                word.trim_matches(|c: char| !c.is_alphanumeric())
                    .to_lowercase()
            })
            .filter(|word| {
                word.len() >= MIN_KEYWORD_BYTES && !STOPWORDS.contains(&word.as_str())
            })
            .collect()
    }

    /// Returns `true` when either keyword contains the other as a substring
    /// (so `"git"` matches `"github"` and vice versa).
    fn keywords_match(a: &str, b: &str) -> bool {
        a.contains(b) || b.contains(a)
    }

    /// Score a skill by keyword overlap between the corpus keywords and the skill's
    /// name + description + content (first 500 chars).
    ///
    /// The score is the fraction of the skill's *distinct* keywords that match
    /// at least one corpus keyword (substring match in either direction), so
    /// it always lies in `0.0..=1.0`. Repeated words in the conversation or in
    /// the skill text do not inflate or deflate the result. Returns `0.0` when
    /// the skill yields no keywords.
    fn score_skill(corpus_keywords: &[String], skill: &SkillEntry) -> f32 {
        /// Number of leading characters of the skill content that are sampled.
        const CONTENT_SAMPLE_CHARS: usize = 500;

        let metadata = format!(
            "{} {}",
            skill.name,
            skill.description.as_deref().unwrap_or("")
        );
        let content_sample: String = skill.content.chars().take(CONTENT_SAMPLE_CHARS).collect();

        let mut skill_keywords = Self::extract_keywords(&metadata);
        skill_keywords.extend(Self::extract_keywords(&content_sample));
        // Dedupe so a word repeated in the skill text counts once in the
        // denominator.
        skill_keywords.sort_unstable();
        skill_keywords.dedup();

        if skill_keywords.is_empty() {
            return 0.0;
        }

        let matched = skill_keywords
            .iter()
            .filter(|sk| {
                corpus_keywords
                    .iter()
                    .any(|kw| Self::keywords_match(sk.as_str(), kw.as_str()))
            })
            .count();

        matched as f32 / skill_keywords.len() as f32
    }

    /// Extract the best excerpt from skill content — the line most relevant to the corpus.
    ///
    /// The content is split on `'\n'`, so every non-blank line is treated as
    /// its own "paragraph". Each line is ranked by how many distinct corpus
    /// keywords match one of its keywords; the highest-ranked line wins and
    /// the earliest line wins ties. If no line matches at all, the first 300
    /// characters of the content are used instead, followed by `...` only when
    /// the content was actually truncated.
    ///
    /// The returned string always starts with a `# <name> (auto-matched)`
    /// header followed by a blank line.
    fn best_excerpt(corpus: &str, skill: &SkillEntry) -> String {
        /// Maximum number of content characters used by the fallback excerpt.
        const FALLBACK_CHARS: usize = 300;

        // Distinct keywords only: a word repeated in the conversation must not
        // outweigh a line that matches several different words.
        let mut corpus_kws = Self::extract_keywords(corpus);
        corpus_kws.sort_unstable();
        corpus_kws.dedup();

        let mut best: Option<(usize, &str)> = None;
        let candidates = skill
            .content
            .split('\n')
            .map(str::trim)
            .filter(|para| !para.is_empty());

        for para in candidates {
            let para_kws = Self::extract_keywords(para);
            let overlap = corpus_kws
                .iter()
                .filter(|kw| {
                    para_kws
                        .iter()
                        .any(|pk| Self::keywords_match(pk.as_str(), kw.as_str()))
                })
                .count();

            // Strict `>` both rejects zero-overlap lines and keeps the
            // earliest line when several share the top overlap.
            if overlap > best.map_or(0, |(top, _)| top) {
                best = Some((overlap, para));
            }
        }

        match best {
            // Return the best line with a header.
            Some((_, para)) => format!("# {} (auto-matched)\n\n{}", skill.name, para),
            // Fallback: use the head of the content as the excerpt.
            None => {
                let mut chars = skill.content.chars();
                let head: String = chars.by_ref().take(FALLBACK_CHARS).collect();
                // Only signal truncation when something was really cut off.
                let ellipsis = if chars.next().is_some() { "..." } else { "" };
                format!(
                    "# {} (auto-matched)\n\n{}{}",
                    skill.name,
                    head.trim(),
                    ellipsis
                )
            }
        }
    }
}