gitdataai/libs/agent/embed/service.rs

use async_trait::async_trait;
use qdrant_client::qdrant::Filter;
use sea_orm::DatabaseConnection;
use std::sync::Arc;

use super::client::{EmbedClient, EmbedPayload, EmbedVector, SearchResult};

/// Maximum characters per chunk for embedding (approximates token limit).
/// text-embedding-3-small: 8192 token limit.
/// For CJK ~1 char/token, for English ~4 chars/token.
/// Conservative limit: 7000 chars to leave room for all languages.
const MAX_CHUNK_CHARS: usize = 7000;

/// An entity that can be turned into an embedding point.
///
/// Consumed by `EmbedService::embed_issues`, which embeds the value of
/// `to_text()` and stores `entity_type()` / `entity_id()` in the point payload.
#[async_trait]
pub trait Embeddable {
    /// Label stored as the payload's `entity_type`.
    fn entity_type(&self) -> &'static str;
    /// Text that is sent to the embedding model and stored in the payload.
    fn to_text(&self) -> String;
    /// Identifier used both as the point id and as the payload's `entity_id`.
    fn entity_id(&self) -> String;
}

/// Split `text` into chunks of at most `MAX_CHUNK_CHARS` characters.
///
/// Each chunk ends at the best boundary found inside its window, in order of
/// preference:
///
/// 1. after the last blank line (`"\n\n"`),
/// 2. after the last newline,
/// 3. after the last sentence end (`". "`, `"! "` or `"? "`, whichever occurs
///    latest in the window),
/// 4. a hard cut after `MAX_CHUNK_CHARS` characters.
///
/// Concatenating the returned chunks reproduces `text` exactly. The result
/// always holds at least one chunk (a single empty string for empty input),
/// and a multi-byte character is never split.
fn chunk_text(text: &str) -> Vec<String> {
    if text.is_empty() {
        return vec![String::new()];
    }
    // The byte length is an upper bound on the character count, so this is a
    // cheap fast path that can never let an over-long text through.
    if text.len() <= MAX_CHUNK_CHARS {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    let mut rest = text;

    loop {
        // Byte offset just past the first MAX_CHUNK_CHARS characters of the
        // remainder; `None` means the remainder fits into a single chunk.
        let window_end = match rest.char_indices().nth(MAX_CHUNK_CHARS) {
            Some((offset, _)) => offset,
            None => {
                chunks.push(rest.to_string());
                break;
            }
        };
        let window = &rest[..window_end];

        // All patterns are ASCII, so every offset below is a char boundary.
        let split_at = window
            .rfind("\n\n")
            .map(|pos| pos + 2) // keep the paragraph break with this chunk
            .or_else(|| window.rfind('\n').map(|pos| pos + 1))
            .or_else(|| {
                // Use the latest sentence end regardless of which punctuation
                // mark closes it, so an early ". " cannot force a tiny chunk
                // when a later "? " or "! " is available.
                [". ", "! ", "? "]
                    .iter()
                    .filter_map(|pat| window.rfind(*pat))
                    .max()
                    .map(|pos| pos + 1)
            })
            .unwrap_or(window_end); // no boundary found: hard cut

        // `split_at` is in 1..=window_end and window_end < rest.len(), so
        // every iteration makes progress and the tail is never empty.
        let (head, tail) = rest.split_at(split_at);
        chunks.push(head.to_string());
        rest = tail;
    }

    chunks
}

/// Service layer that builds embedding points for issues, repos, skills,
/// tags and room memories and hands them to an `EmbedClient`.
#[derive(Clone)]
pub struct EmbedService {
    /// Shared client used for every embedding, upsert, search and delete call.
    client: Arc<EmbedClient>,
    /// Database connection; only exposed through `db()` in the methods visible here.
    db: DatabaseConnection,
    /// Embedding model name passed to the client's embed and search calls.
    model_name: String,
    /// Vector dimensionality passed when ensuring collections exist.
    dimensions: u64,
}

impl EmbedService {
    /// Build a service around `client`, wrapping it in an `Arc` so clones of
    /// the service share one client.
    pub fn new(
        client: EmbedClient,
        db: DatabaseConnection,
        model_name: String,
        dimensions: u64,
    ) -> Self {
        let client = Arc::new(client);
        Self {
            client,
            db,
            model_name,
            dimensions,
        }
    }

    /// Embed a single issue and upsert it as one point.
    ///
    /// The embedded text is `title`, followed by a blank line and `body` when
    /// a non-empty body is supplied. The point uses `id` both as its id and as
    /// the payload's `entity_id`, with entity type `"issue"`.
    ///
    /// # Errors
    /// Propagates failures from the embedding call and from the upsert.
    pub async fn embed_issue(
        &self,
        id: &str,
        title: &str,
        body: Option<&str>,
    ) -> crate::Result<()> {
        let text = body
            .filter(|b| !b.is_empty())
            .map_or_else(|| title.to_string(), |b| format!("{}\n\n{}", title, b));

        tracing::debug!(issue_id = %id, text_len = text.len(), "embed_issue: calling embedding API");
        let vector = self.client.embed_text(&text, &self.model_name).await?;
        tracing::debug!(issue_id = %id, vec_dim = vector.len(), "embed_issue: embedding done");

        let payload = EmbedPayload {
            entity_type: "issue".to_string(),
            entity_id: id.to_string(),
            text,
            extra: None,
        };
        let point = EmbedVector {
            id: id.to_string(),
            vector,
            payload,
        };

        self.client.upsert(vec![point]).await?;
        tracing::info!(issue_id = %id, "embed_issue: upsert complete");
        Ok(())
    }

    /// Embed a single repository and upsert it as one point.
    ///
    /// The embedded text is `"{name}: {description}"` when a non-empty
    /// description is supplied, otherwise just `name`. The point uses `id`
    /// both as its id and as the payload's `entity_id`, with entity type
    /// `"repo"`.
    ///
    /// # Errors
    /// Propagates failures from the embedding call and from the upsert.
    pub async fn embed_repo(
        &self,
        id: &str,
        name: &str,
        description: Option<&str>,
    ) -> crate::Result<()> {
        let text = description
            .filter(|d| !d.is_empty())
            .map_or_else(|| name.to_string(), |d| format!("{}: {}", name, d));

        tracing::debug!(repo_id = %id, text_len = text.len(), "embed_repo: calling embedding API");
        let vector = self.client.embed_text(&text, &self.model_name).await?;
        tracing::debug!(repo_id = %id, vec_dim = vector.len(), "embed_repo: embedding done");

        let payload = EmbedPayload {
            entity_type: "repo".to_string(),
            entity_id: id.to_string(),
            text,
            extra: None,
        };
        let point = EmbedVector {
            id: id.to_string(),
            vector,
            payload,
        };

        self.client.upsert(vec![point]).await?;
        tracing::info!(repo_id = %id, "embed_repo: upsert complete");
        Ok(())
    }

    /// Batch-embed a list of `Embeddable` items and upsert one point per item.
    ///
    /// Each item's `to_text()` is computed once; that same string is sent to
    /// the embedding model and stored in the payload, so the two cannot
    /// diverge. An empty `items` list is a no-op.
    ///
    /// If the client returns a different number of embeddings than items, a
    /// warning is logged and only the pairs that line up are upserted.
    ///
    /// # Errors
    /// Propagates failures from the batch embedding call and from the upsert.
    pub async fn embed_issues<T: Embeddable + Send + Sync>(
        &self,
        items: Vec<T>,
    ) -> crate::Result<()> {
        if items.is_empty() {
            return Ok(());
        }

        let texts: Vec<String> = items.iter().map(|item| item.to_text()).collect();
        tracing::debug!(count = texts.len(), "embed_issues: calling embed_batch");
        let embeddings = self.client.embed_batch(&texts, &self.model_name).await?;
        tracing::debug!(count = embeddings.len(), "embed_issues: batch done");

        if embeddings.len() != items.len() {
            tracing::warn!(
                expected = items.len(),
                received = embeddings.len(),
                "embed_issues: embedding count mismatch, unmatched items are skipped"
            );
        }

        let points: Vec<EmbedVector> = items
            .into_iter()
            .zip(texts)
            .zip(embeddings)
            .map(|((item, text), vector)| {
                let entity_id = item.entity_id();
                EmbedVector {
                    id: entity_id.clone(),
                    vector,
                    payload: EmbedPayload {
                        entity_type: item.entity_type().to_string(),
                        entity_id,
                        // Reuse the text that was actually embedded.
                        text,
                        extra: None,
                    },
                }
            })
            .collect();

        let count = points.len();
        self.client.upsert(points).await?;
        tracing::info!(count = count, "embed_issues: upsert complete");
        Ok(())
    }

    /// Semantic search over issues.
    ///
    /// Delegates to `EmbedClient::search` with entity type `"issue"`, the
    /// service's model name and `limit`.
    pub async fn search_issues(
        &self,
        query: &str,
        limit: usize,
    ) -> crate::Result<Vec<SearchResult>> {
        self.client
            .search(query, "issue", &self.model_name, limit)
            .await
    }

    /// Semantic search over repositories.
    ///
    /// Delegates to `EmbedClient::search` with entity type `"repo"`, the
    /// service's model name and `limit`.
    pub async fn search_repos(
        &self,
        query: &str,
        limit: usize,
    ) -> crate::Result<Vec<SearchResult>> {
        self.client
            .search(query, "repo", &self.model_name, limit)
            .await
    }

    /// Semantic search over issues with a caller-supplied Qdrant `Filter`.
    ///
    /// Delegates to `EmbedClient::search_with_filter` with entity type
    /// `"issue"`; `filter` is passed through unchanged.
    pub async fn search_issues_filtered(
        &self,
        query: &str,
        limit: usize,
        filter: Filter,
    ) -> crate::Result<Vec<SearchResult>> {
        self.client
            .search_with_filter(query, "issue", &self.model_name, limit, filter)
            .await
    }

    /// Delete the embedding(s) of an issue.
    ///
    /// Delegates to `EmbedClient::delete_by_entity_id` with entity type `"issue"`.
    pub async fn delete_issue_embedding(&self, issue_id: &str) -> crate::Result<()> {
        self.client.delete_by_entity_id("issue", issue_id).await
    }

    /// Delete the embedding(s) of a repository.
    ///
    /// Delegates to `EmbedClient::delete_by_entity_id` with entity type `"repo"`.
    pub async fn delete_repo_embedding(&self, repo_id: &str) -> crate::Result<()> {
        self.client.delete_by_entity_id("repo", repo_id).await
    }

    /// Ensure the fixed collections exist, using the service's vector
    /// dimensionality: `"issue"`, `"repo"`, the skill collection, and
    /// `"repo_tag"`, in that order. Stops at the first failure.
    ///
    /// Room memory collections are not handled here; they are created
    /// per-room on first embed.
    pub async fn ensure_collections(&self) -> crate::Result<()> {
        let client = &self.client;
        let dims = self.dimensions;

        client.ensure_collection("issue", dims).await?;
        client.ensure_collection("repo", dims).await?;
        client.ensure_skill_collection(dims).await?;
        client.ensure_collection("repo_tag", dims).await?;

        Ok(())
    }

    /// Borrow the database connection this service was constructed with.
    pub fn db(&self) -> &DatabaseConnection {
        &self.db
    }

    /// Borrow the shared `Arc<EmbedClient>`; clone the `Arc` to keep a handle.
    pub fn client(&self) -> &Arc<EmbedClient> {
        &self.client
    }

    /// Embed a project skill into Qdrant for vector-based semantic search.
    ///
    /// `content` is split with `chunk_text`. A single chunk is handed to
    /// `EmbedClient::embed_skill` unchanged. Longer content is embedded as one
    /// point per chunk with id `"{skill_id}:chunk:{index}"`; each chunk is
    /// embedded as `"{name}: {description} {chunk}"` and carries `name`,
    /// `description`, `chunk_index` and `total_chunks` in its extra payload.
    /// `project_uuid` is stored as the payload's `entity_id`.
    ///
    /// If the client returns a different number of embeddings than chunks, a
    /// warning is logged and only the pairs that line up are upserted.
    ///
    /// # Errors
    /// Propagates failures from the embedding calls and from the upsert.
    pub async fn embed_skill(
        &self,
        skill_id: i64,
        name: &str,
        description: Option<&str>,
        content: &str,
        project_uuid: &str,
    ) -> crate::Result<()> {
        let desc = description.unwrap_or_default();
        let id = skill_id.to_string();

        tracing::debug!(skill_id = %skill_id, name = %name, content_len = content.len(), "embed_skill: starting");

        // Auto-chunk long content
        let texts = chunk_text(content);
        let chunk_count = texts.len();
        tracing::debug!(skill_id = %skill_id, chunks = chunk_count, "embed_skill: chunked");

        if chunk_count == 1 {
            self.client
                .embed_skill(&id, name, desc, content, project_uuid, &self.model_name)
                .await?;
        } else {
            // Multi-chunk: embed each chunk with chunk_index metadata.
            // NOTE(review): the "{name}: {desc} " prefix is added on top of a
            // chunk that may already be at the chunk size limit — confirm the
            // model's token budget leaves room for it.
            let full_texts: Vec<String> = texts
                .iter()
                .map(|t| format!("{}: {} {}", name, desc, t))
                .collect();
            tracing::debug!(skill_id = %skill_id, "embed_skill: calling embed_batch");
            let embeddings = self.client.embed_batch(&full_texts, &self.model_name).await?;

            if embeddings.len() != chunk_count {
                tracing::warn!(
                    skill_id = %skill_id,
                    expected = chunk_count,
                    received = embeddings.len(),
                    "embed_skill: embedding count mismatch, unmatched chunks are skipped"
                );
            }

            // Pair chunks and vectors with `zip` instead of indexing so a
            // surplus embedding can never cause an out-of-bounds panic.
            let points: Vec<EmbedVector> = texts
                .into_iter()
                .zip(embeddings)
                .enumerate()
                .map(|(i, (text, vector))| EmbedVector {
                    id: format!("{}:chunk:{}", id, i),
                    vector,
                    payload: EmbedPayload {
                        entity_type: "skill".to_string(),
                        entity_id: project_uuid.to_string(),
                        text,
                        extra: serde_json::json!({
                            "name": name,
                            "description": desc,
                            "chunk_index": i,
                            "total_chunks": chunk_count,
                        })
                        .into(),
                    },
                })
                .collect();

            self.client.upsert(points).await?;
        }
        tracing::info!(skill_id = %skill_id, chunks = chunk_count, "embed_skill: complete");
        Ok(())
    }

    /// Embed an issue with auto-chunking for long content.
    pub async fn embed_issue_chunked(
        &self,
        id: &str,
        title: &str,
        body: Option<&str>,
    ) -> crate::Result<()> {
        let text = match body {
            Some(b) if !b.is_empty() => format!("{}\n\n{}", title, b),
            _ => title.to_string(),
        };

        let chunks = chunk_text(&text);
        if chunks.len() == 1 {
            return self.embed_issue(id, title, body).await;
        }

        let embeddings = self.client.embed_batch(&chunks, &self.model_name).await?;

        let points: Vec<EmbedVector> = embeddings.into_iter().enumerate().map(|(i, vector)| {
            EmbedVector {
                id: format!("{}:chunk:{}", id, i),
                vector,
                payload: EmbedPayload {
                    entity_type: "issue".to_string(),
                    entity_id: id.to_string(),
                    text: chunks[i].clone(),
                    extra: serde_json::json!({
                        "chunk_index": i,
                        "total_chunks": chunks.len(),
                    }).into(),
                },
            }
        }).collect();

        self.client.upsert(points).await
    }

    /// Batch-embed multiple conversation messages into per-room Qdrant collections.
    /// Auto-chunks long messages and filters non-text/system/empty content.
    /// Handles all filtering internally: only text-type, non-empty, non-system messages are embedded.
    pub async fn embed_memories_batch(
        &self,
        messages: Vec<EmbedMemoryInput>,
    ) -> crate::Result<()> {
        if messages.is_empty() {
            return Ok(());
        }

        // Group by room collection for batch upsert to reduce Qdrant calls
        use std::collections::HashMap;
        let mut by_room: HashMap<String, Vec<(EmbedMemoryInput, Vec<String>)>> = HashMap::new();

        for msg in messages {
            let chunks = chunk_text(&msg.content);
            if chunks.is_empty() || chunks.iter().all(|c| c.trim().is_empty()) {
                continue;
            }
            let collection = crate::embed::qdrant::QdrantClient::room_memory_collection_name(
                &msg.project_name, &msg.room_id,
            );
            by_room.entry(collection).or_default().push((msg, chunks));
        }

        for (collection, entries) in &by_room {
            // Collect all texts for batch embedding
            let all_texts: Vec<String> = entries.iter()
                .flat_map(|(_, chunks)| chunks.iter().cloned())
                .collect();

            if all_texts.is_empty() {
                continue;
            }

            let embeddings = self.client.embed_batch(&all_texts, &self.model_name).await?;

            // Ensure the room collection exists with correct dimensions
            if let Some((first, _)) = entries.first() {
                let _ = self.client
                    .ensure_room_memory_collection(&first.project_name, &first.room_id, self.dimensions)
                    .await;
            }

            // Build points: one per chunk
            let mut points = Vec::new();
            let mut embed_idx = 0;
            for (msg, chunks) in entries {
                for (chunk_i, chunk) in chunks.iter().enumerate() {
                    if embed_idx >= embeddings.len() {
                        break;
                    }
                    let point_id = if chunks.len() == 1 {
                        msg.message_id.clone()
                    } else {
                        format!("{}:chunk:{}", msg.message_id, chunk_i)
                    };
                    points.push(EmbedVector {
                        id: point_id,
                        vector: embeddings[embed_idx].clone(),
                        payload: EmbedPayload {
                            entity_type: "memory".to_string(),
                            entity_id: msg.room_id.clone(),
                            text: chunk.clone(),
                            extra: serde_json::json!({
                                "user_id": msg.user_id,
                                "sender_type": msg.sender_type,
                                "chunk_index": if chunks.len() > 1 { Some(chunk_i) } else { None },
                                "total_chunks": if chunks.len() > 1 { Some(chunks.len()) } else { None },
                            }).into(),
                        },
                    });
                    embed_idx += 1;
                }
            }

            if let Err(e) = self.client.upsert_to_collection(collection, points).await {
                tracing::warn!(collection = %collection, error = %e, "batch memory embed failed");
            }
        }

        Ok(())
    }

    /// Batch-embed repo tags with project isolation.
    /// Each tag stores project_id as entity_id for post-filtering.
    pub async fn embed_tags_batch(
        &self,
        tags: Vec<TagEmbedInput>,
    ) -> crate::Result<()> {
        if tags.is_empty() {
            return Ok(());
        }

        let texts: Vec<String> = tags
            .iter()
            .map(|t| {
                if let Some(ref desc) = t.description {
                    if !desc.is_empty() {
                        format!("{}: {}", t.name, desc)
                    } else {
                        t.name.clone()
                    }
                } else {
                    t.name.clone()
                }
            })
            .collect();

        let embeddings = self.client.embed_batch(&texts, &self.model_name).await?;

        let points: Vec<EmbedVector> = tags
            .into_iter()
            .zip(embeddings.into_iter())
            .map(|(tag, vector)| {
                let point_id = format!("{}:{}", tag.repo_id, tag.name);
                EmbedVector {
                    id: point_id,
                    vector,
                    payload: EmbedPayload {
                        entity_type: "repo_tag".to_string(),
                        entity_id: tag.project_id.clone(),
                        text: tag.name.clone(),
                        extra: serde_json::json!({
                            "repo_id": tag.repo_id,
                            "repo_name": tag.repo_name,
                            "tag_name": tag.name,
                            "description": tag.description,
                        })
                        .into(),
                    },
                }
            })
            .collect();

        self.client.upsert(points).await
    }

    /// Search repo tags by semantic similarity within a project.
    ///
    /// Tags of all projects are searched through `EmbedClient::search` and
    /// the hits are then filtered client-side by `project_id` (stored in the
    /// payload's `entity_id`), so more than `limit` hits are requested to
    /// leave room for hits that belong to other projects. At most `limit`
    /// results are returned.
    ///
    /// NOTE(review): over-fetching is only a heuristic; if other projects
    /// dominate the top hits this can still return fewer than `limit`
    /// results. A server-side filter on the project id would be exact —
    /// confirm the stored payload key before switching.
    pub async fn search_tags(
        &self,
        query: &str,
        project_id: &str,
        limit: usize,
    ) -> crate::Result<Vec<SearchResult>> {
        // How many times `limit` to request before post-filtering.
        const OVERFETCH_FACTOR: usize = 4;

        // Saturating arithmetic avoids an overflow panic for huge limits;
        // always request at least `limit + 1` as before.
        let fetch = limit
            .saturating_mul(OVERFETCH_FACTOR)
            .max(limit.saturating_add(1));

        let mut results = self
            .client
            .search(query, "repo_tag", &self.model_name, fetch)
            .await?;
        results.retain(|r| r.payload.entity_id == project_id);
        results.truncate(limit);
        Ok(results)
    }

    /// Name of the embedding model this service passes to the client.
    pub fn model_name(&self) -> &str {
        &self.model_name
    }

    /// Vector dimensionality this service was configured with.
    pub fn dimensions(&self) -> u64 {
        self.dimensions
    }

    /// Borrow the underlying `EmbedClient` directly (without the `Arc`
    /// wrapper that `client()` exposes).
    pub fn embed_client(&self) -> &EmbedClient {
        &self.client
    }

    /// Search skills by semantic similarity within a project.
    ///
    /// Delegates to `EmbedClient::search_skills`, passing the service's model
    /// name along with `project_uuid` and `limit`.
    pub async fn search_skills(
        &self,
        query: &str,
        project_uuid: &str,
        limit: usize,
    ) -> crate::Result<Vec<SearchResult>> {
        self.client
            .search_skills(query, &self.model_name, project_uuid, limit)
            .await
    }

    /// Embed a conversation message into Qdrant as a memory vector.
    ///
    /// Delegates to `EmbedClient::embed_memory`, adding the service's model
    /// name. Unlike `embed_memories_batch`, no chunking is done in this method.
    pub async fn embed_memory(
        &self,
        message_id: &str,
        text: &str,
        project_name: &str,
        room_id: &str,
        user_id: Option<&str>,
    ) -> crate::Result<()> {
        self.client
            .embed_memory(message_id, text, project_name, room_id, user_id, &self.model_name)
            .await
    }

    /// Search past conversation messages by semantic similarity within a room.
    ///
    /// Delegates to `EmbedClient::search_memories`, adding the service's
    /// model name and vector dimensionality.
    pub async fn search_memories(
        &self,
        query: &str,
        project_name: &str,
        room_id: &str,
        limit: usize,
    ) -> crate::Result<Vec<SearchResult>> {
        self.client
            .search_memories(query, &self.model_name, project_name, room_id, limit, self.dimensions)
            .await
    }
}

/// Input struct for batch memory embedding into per-room Qdrant collections.
///
/// Consumed by `EmbedService::embed_memories_batch`.
#[derive(Debug, Clone)]
pub struct EmbedMemoryInput {
    /// Point id for a single-chunk message; multi-chunk messages use
    /// `"{message_id}:chunk:{index}"`.
    pub message_id: String,
    /// Message text; chunked and embedded. Blank content is skipped.
    pub content: String,
    /// Combined with `room_id` to derive the room's collection name.
    pub project_name: String,
    /// Room the message belongs to; also stored as the payload's `entity_id`.
    pub room_id: String,
    /// Stored in the extra payload under `"user_id"`.
    pub user_id: Option<String>,
    /// Stored in the extra payload under `"sender_type"`.
    pub sender_type: String,
}

/// Input struct for batch tag embedding.
/// Re-exported from models for backward compatibility.
pub use models::TagEmbedInput;