use async_trait::async_trait; use qdrant_client::qdrant::Filter; use sea_orm::DatabaseConnection; use std::sync::Arc; use super::client::{EmbedClient, EmbedPayload, EmbedVector, SearchResult}; /// Maximum characters per chunk for embedding (approximates token limit). /// text-embedding-3-small: 8192 token limit. /// For CJK ~1 char/token, for English ~4 chars/token. /// Conservative limit: 7000 chars to leave room for all languages. const MAX_CHUNK_CHARS: usize = 7000; #[async_trait] pub trait Embeddable { fn entity_type(&self) -> &'static str; fn to_text(&self) -> String; fn entity_id(&self) -> String; } /// Split long text into chunks at paragraph/sentence boundaries. /// Returns at least one chunk even for empty text. /// Safe for multi-byte characters (uses char indices, not byte indices). fn chunk_text(text: &str) -> Vec { if text.is_empty() { return vec![String::new()]; } if text.len() <= MAX_CHUNK_CHARS { return vec![text.to_string()]; } // Collect char boundary byte positions let char_indices: Vec = text.char_indices().map(|(i, _)| i).collect(); let total_chars = char_indices.len(); let mut chunks = Vec::new(); let mut start_idx = 0; // char index while start_idx < total_chars { // Start byte offset let byte_start = char_indices[start_idx]; // Find end char index: at most MAX_CHUNK_CHARS characters let end_char_idx = (start_idx + MAX_CHUNK_CHARS).min(total_chars); let byte_end_candidate = char_indices[end_char_idx - 1] + text[char_indices[end_char_idx - 1]..].chars().next().map(|c| c.len_utf8()).unwrap_or(1); if end_char_idx >= total_chars { chunks.push(text[byte_start..].to_string()); break; } // Try to break at paragraph or sentence boundary in the allowed range let search_range = &text[byte_start..byte_end_candidate]; let break_at = if let Some(pos) = search_range.rfind("\n\n") { Some(pos + 2) // after the paragraph break } else if let Some(pos) = search_range.rfind('\n') { Some(pos + 1) } else if let Some(pos) = search_range.rfind(". ") { Some(pos + 1) } else if let Some(pos) = search_range.rfind("! ") { Some(pos + 1) } else if let Some(pos) = search_range.rfind("? ") { Some(pos + 1) } else { None }; if let Some(offset) = break_at { let byte_end = byte_start + offset; chunks.push(text[byte_start..byte_end].to_string()); // Advance char index to match the byte break let mut advance = start_idx + 1; while advance < total_chars && char_indices[advance] < byte_end { advance += 1; } start_idx = advance; } else { // Hard break at char boundary chunks.push(text[byte_start..byte_end_candidate].to_string()); start_idx = end_char_idx; } } chunks } #[derive(Clone)] pub struct EmbedService { client: Arc, db: DatabaseConnection, model_name: String, dimensions: u64, } impl EmbedService { pub fn new( client: EmbedClient, db: DatabaseConnection, model_name: String, dimensions: u64, ) -> Self { Self { client: Arc::new(client), db, model_name, dimensions, } } pub async fn embed_issue( &self, id: &str, title: &str, body: Option<&str>, ) -> crate::Result<()> { let text = match body { Some(b) if !b.is_empty() => format!("{}\n\n{}", title, b), _ => title.to_string(), }; tracing::debug!(issue_id = %id, text_len = text.len(), "embed_issue: calling embedding API"); let vector = self.client.embed_text(&text, &self.model_name).await?; tracing::debug!(issue_id = %id, vec_dim = vector.len(), "embed_issue: embedding done"); let point = EmbedVector { id: id.to_string(), vector, payload: EmbedPayload { entity_type: "issue".to_string(), entity_id: id.to_string(), text, extra: None, }, }; self.client.upsert(vec![point]).await?; tracing::info!(issue_id = %id, "embed_issue: upsert complete"); Ok(()) } pub async fn embed_repo( &self, id: &str, name: &str, description: Option<&str>, ) -> crate::Result<()> { let text = match description { Some(d) if !d.is_empty() => format!("{}: {}", name, d), _ => name.to_string(), }; tracing::debug!(repo_id = %id, text_len = text.len(), "embed_repo: calling embedding API"); let vector = self.client.embed_text(&text, &self.model_name).await?; tracing::debug!(repo_id = %id, vec_dim = vector.len(), "embed_repo: embedding done"); let point = EmbedVector { id: id.to_string(), vector, payload: EmbedPayload { entity_type: "repo".to_string(), entity_id: id.to_string(), text, extra: None, }, }; self.client.upsert(vec![point]).await?; tracing::info!(repo_id = %id, "embed_repo: upsert complete"); Ok(()) } pub async fn embed_issues( &self, items: Vec, ) -> crate::Result<()> { if items.is_empty() { return Ok(()); } let texts: Vec = items.iter().map(|i| i.to_text()).collect(); tracing::debug!(count = texts.len(), "embed_issues: calling embed_batch"); let embeddings = self.client.embed_batch(&texts, &self.model_name).await?; tracing::debug!(count = embeddings.len(), "embed_issues: batch done"); let points: Vec = items .into_iter() .zip(embeddings.into_iter()) .map(|(item, vector)| EmbedVector { id: item.entity_id(), vector, payload: EmbedPayload { entity_type: item.entity_type().to_string(), entity_id: item.entity_id(), text: item.to_text(), extra: None, }, }) .collect(); let count = points.len(); self.client.upsert(points).await?; tracing::info!(count = count, "embed_issues: upsert complete"); Ok(()) } pub async fn search_issues( &self, query: &str, limit: usize, ) -> crate::Result> { self.client .search(query, "issue", &self.model_name, limit) .await } pub async fn search_repos( &self, query: &str, limit: usize, ) -> crate::Result> { self.client .search(query, "repo", &self.model_name, limit) .await } pub async fn search_issues_filtered( &self, query: &str, limit: usize, filter: Filter, ) -> crate::Result> { self.client .search_with_filter(query, "issue", &self.model_name, limit, filter) .await } pub async fn delete_issue_embedding(&self, issue_id: &str) -> crate::Result<()> { self.client.delete_by_entity_id("issue", issue_id).await } pub async fn delete_repo_embedding(&self, repo_id: &str) -> crate::Result<()> { self.client.delete_by_entity_id("repo", repo_id).await } pub async fn ensure_collections(&self) -> crate::Result<()> { self.client .ensure_collection("issue", self.dimensions) .await?; self.client .ensure_collection("repo", self.dimensions) .await?; self.client.ensure_skill_collection(self.dimensions).await?; self.client .ensure_collection("repo_tag", self.dimensions) .await?; // Room memory collections are created per-room on first embed Ok(()) } pub fn db(&self) -> &DatabaseConnection { &self.db } pub fn client(&self) -> &Arc { &self.client } /// Embed a project skill into Qdrant for vector-based semantic search. pub async fn embed_skill( &self, skill_id: i64, name: &str, description: Option<&str>, content: &str, project_uuid: &str, ) -> crate::Result<()> { let desc = description.unwrap_or_default(); let id = skill_id.to_string(); tracing::debug!(skill_id = %skill_id, name = %name, content_len = content.len(), "embed_skill: starting"); // Auto-chunk long content let texts = chunk_text(content); tracing::debug!(skill_id = %skill_id, chunks = texts.len(), "embed_skill: chunked"); if texts.len() == 1 { self.client .embed_skill(&id, name, desc, content, project_uuid, &self.model_name) .await?; } else { // Multi-chunk: embed each chunk with chunk_index metadata let full_texts: Vec = texts.iter().map(|t| format!("{}: {} {}", name, desc, t)).collect(); tracing::debug!(skill_id = %skill_id, "embed_skill: calling embed_batch"); let embeddings = self.client.embed_batch(&full_texts, &self.model_name).await?; let points: Vec = embeddings.into_iter().enumerate().map(|(i, vector)| { EmbedVector { id: format!("{}:chunk:{}", id, i), vector, payload: EmbedPayload { entity_type: "skill".to_string(), entity_id: project_uuid.to_string(), text: texts[i].clone(), extra: serde_json::json!({ "name": name, "description": desc, "chunk_index": i, "total_chunks": texts.len(), }).into(), }, } }).collect(); self.client.upsert(points).await?; } tracing::info!(skill_id = %skill_id, chunks = texts.len(), "embed_skill: complete"); Ok(()) } /// Embed an issue with auto-chunking for long content. pub async fn embed_issue_chunked( &self, id: &str, title: &str, body: Option<&str>, ) -> crate::Result<()> { let text = match body { Some(b) if !b.is_empty() => format!("{}\n\n{}", title, b), _ => title.to_string(), }; let chunks = chunk_text(&text); if chunks.len() == 1 { return self.embed_issue(id, title, body).await; } let embeddings = self.client.embed_batch(&chunks, &self.model_name).await?; let points: Vec = embeddings.into_iter().enumerate().map(|(i, vector)| { EmbedVector { id: format!("{}:chunk:{}", id, i), vector, payload: EmbedPayload { entity_type: "issue".to_string(), entity_id: id.to_string(), text: chunks[i].clone(), extra: serde_json::json!({ "chunk_index": i, "total_chunks": chunks.len(), }).into(), }, } }).collect(); self.client.upsert(points).await } /// Batch-embed multiple conversation messages into per-room Qdrant collections. /// Auto-chunks long messages and filters non-text/system/empty content. /// Handles all filtering internally: only text-type, non-empty, non-system messages are embedded. pub async fn embed_memories_batch( &self, messages: Vec, ) -> crate::Result<()> { if messages.is_empty() { return Ok(()); } // Group by room collection for batch upsert to reduce Qdrant calls use std::collections::HashMap; let mut by_room: HashMap)>> = HashMap::new(); for msg in messages { let chunks = chunk_text(&msg.content); if chunks.is_empty() || chunks.iter().all(|c| c.trim().is_empty()) { continue; } let collection = crate::embed::qdrant::QdrantClient::room_memory_collection_name( &msg.project_name, &msg.room_id, ); by_room.entry(collection).or_default().push((msg, chunks)); } for (collection, entries) in &by_room { // Collect all texts for batch embedding let all_texts: Vec = entries.iter() .flat_map(|(_, chunks)| chunks.iter().cloned()) .collect(); if all_texts.is_empty() { continue; } let embeddings = self.client.embed_batch(&all_texts, &self.model_name).await?; // Ensure the room collection exists with correct dimensions if let Some((first, _)) = entries.first() { let _ = self.client .ensure_room_memory_collection(&first.project_name, &first.room_id, self.dimensions) .await; } // Build points: one per chunk let mut points = Vec::new(); let mut embed_idx = 0; for (msg, chunks) in entries { for (chunk_i, chunk) in chunks.iter().enumerate() { if embed_idx >= embeddings.len() { break; } let point_id = if chunks.len() == 1 { msg.message_id.clone() } else { format!("{}:chunk:{}", msg.message_id, chunk_i) }; points.push(EmbedVector { id: point_id, vector: embeddings[embed_idx].clone(), payload: EmbedPayload { entity_type: "memory".to_string(), entity_id: msg.room_id.clone(), text: chunk.clone(), extra: serde_json::json!({ "user_id": msg.user_id, "sender_type": msg.sender_type, "chunk_index": if chunks.len() > 1 { Some(chunk_i) } else { None }, "total_chunks": if chunks.len() > 1 { Some(chunks.len()) } else { None }, }).into(), }, }); embed_idx += 1; } } if let Err(e) = self.client.upsert_to_collection(collection, points).await { tracing::warn!(collection = %collection, error = %e, "batch memory embed failed"); } } Ok(()) } /// Batch-embed repo tags with project isolation. /// Each tag stores project_id as entity_id for post-filtering. pub async fn embed_tags_batch( &self, tags: Vec, ) -> crate::Result<()> { if tags.is_empty() { return Ok(()); } let texts: Vec = tags .iter() .map(|t| { if let Some(ref desc) = t.description { if !desc.is_empty() { format!("{}: {}", t.name, desc) } else { t.name.clone() } } else { t.name.clone() } }) .collect(); let embeddings = self.client.embed_batch(&texts, &self.model_name).await?; let points: Vec = tags .into_iter() .zip(embeddings.into_iter()) .map(|(tag, vector)| { let point_id = format!("{}:{}", tag.repo_id, tag.name); EmbedVector { id: point_id, vector, payload: EmbedPayload { entity_type: "repo_tag".to_string(), entity_id: tag.project_id.clone(), text: tag.name.clone(), extra: serde_json::json!({ "repo_id": tag.repo_id, "repo_name": tag.repo_name, "tag_name": tag.name, "description": tag.description, }) .into(), }, } }) .collect(); self.client.upsert(points).await } /// Search repo tags by semantic similarity within a project. /// Filters by project_id (stored in entity_id) for project isolation. pub async fn search_tags( &self, query: &str, project_id: &str, limit: usize, ) -> crate::Result> { let mut results = self .client .search(query, "repo_tag", &self.model_name, limit + 1) .await?; results.retain(|r| r.payload.entity_id == project_id); results.truncate(limit); Ok(results) } pub fn model_name(&self) -> &str { &self.model_name } pub fn dimensions(&self) -> u64 { self.dimensions } pub fn embed_client(&self) -> &EmbedClient { &self.client } /// Search skills by semantic similarity within a project. pub async fn search_skills( &self, query: &str, project_uuid: &str, limit: usize, ) -> crate::Result> { self.client .search_skills(query, &self.model_name, project_uuid, limit) .await } /// Embed a conversation message into Qdrant as a memory vector. pub async fn embed_memory( &self, message_id: &str, text: &str, project_name: &str, room_id: &str, user_id: Option<&str>, ) -> crate::Result<()> { self.client .embed_memory(message_id, text, project_name, room_id, user_id, &self.model_name) .await } /// Search past conversation messages by semantic similarity within a room. pub async fn search_memories( &self, query: &str, project_name: &str, room_id: &str, limit: usize, ) -> crate::Result> { self.client .search_memories(query, &self.model_name, project_name, room_id, limit, self.dimensions) .await } } /// Input struct for batch memory embedding into per-room Qdrant collections. #[derive(Debug, Clone)] pub struct EmbedMemoryInput { pub message_id: String, pub content: String, pub project_name: String, pub room_id: String, pub user_id: Option, pub sender_type: String, } /// Input struct for batch tag embedding. #[derive(Debug, Clone)] pub struct TagEmbedInput { pub repo_id: String, pub repo_name: String, pub project_id: String, pub name: String, pub description: Option, }