use std::collections::HashMap; use super::chunk::chunk_text; use super::client::{EmbedPayload, EmbedVector}; use super::embeddable::{EmbedMemoryInput, Embeddable}; /// Embedding and upsert operations for entity vectors in Qdrant. impl super::EmbedService { pub async fn embed_issue( &self, id: &str, title: &str, body: Option<&str>, ) -> crate::Result<()> { let text = match body { Some(b) if !b.is_empty() => format!("{}\n\n{}", title, b), _ => title.to_string(), }; tracing::debug!(issue_id = %id, text_len = text.len(), "embed_issue: calling embedding API"); let vector = self.client.embed_text(&text, &self.model_name).await?; tracing::debug!(issue_id = %id, vec_dim = vector.len(), "embed_issue: embedding done"); let point = EmbedVector { id: id.to_string(), vector, payload: EmbedPayload { entity_type: "issue".to_string(), entity_id: id.to_string(), text, extra: None, }, }; self.client.upsert(vec![point]).await?; tracing::info!(issue_id = %id, "embed_issue: upsert complete"); Ok(()) } pub async fn embed_repo( &self, id: &str, name: &str, description: Option<&str>, ) -> crate::Result<()> { let text = match description { Some(d) if !d.is_empty() => format!("{}: {}", name, d), _ => name.to_string(), }; tracing::debug!(repo_id = %id, text_len = text.len(), "embed_repo: calling embedding API"); let vector = self.client.embed_text(&text, &self.model_name).await?; tracing::debug!(repo_id = %id, vec_dim = vector.len(), "embed_repo: embedding done"); let point = EmbedVector { id: id.to_string(), vector, payload: EmbedPayload { entity_type: "repo".to_string(), entity_id: id.to_string(), text, extra: None, }, }; self.client.upsert(vec![point]).await?; tracing::info!(repo_id = %id, "embed_repo: upsert complete"); Ok(()) } pub async fn embed_issues( &self, items: Vec, ) -> crate::Result<()> { if items.is_empty() { return Ok(()); } let texts: Vec = items.iter().map(|i| i.to_text()).collect(); tracing::debug!(count = texts.len(), "embed_issues: calling embed_batch"); let embeddings = self.client.embed_batch(&texts, &self.model_name).await?; tracing::debug!(count = embeddings.len(), "embed_issues: batch done"); let points: Vec = items .into_iter() .zip(embeddings.into_iter()) .map(|(item, vector)| EmbedVector { id: item.entity_id(), vector, payload: EmbedPayload { entity_type: item.entity_type().to_string(), entity_id: item.entity_id(), text: item.to_text(), extra: None, }, }) .collect(); let count = points.len(); self.client.upsert(points).await?; tracing::info!(count = count, "embed_issues: upsert complete"); Ok(()) } pub async fn embed_skill( &self, skill_id: i64, name: &str, description: Option<&str>, content: &str, project_uuid: &str, ) -> crate::Result<()> { let desc = description.unwrap_or_default(); let id = skill_id.to_string(); tracing::debug!(skill_id = %skill_id, name = %name, content_len = content.len(), "embed_skill: starting"); let texts = chunk_text(content); tracing::debug!(skill_id = %skill_id, chunks = texts.len(), "embed_skill: chunked"); if texts.len() == 1 { self.client .embed_skill(&id, name, desc, content, project_uuid, &self.model_name) .await?; } else { let full_texts: Vec = texts .iter() .map(|t| format!("{}: {} {}", name, desc, t)) .collect(); tracing::debug!(skill_id = %skill_id, "embed_skill: calling embed_batch"); let embeddings = self .client .embed_batch(&full_texts, &self.model_name) .await?; let points: Vec = embeddings .into_iter() .enumerate() .map(|(i, vector)| EmbedVector { id: format!("{}:chunk:{}", id, i), vector, payload: EmbedPayload { entity_type: "skill".to_string(), entity_id: project_uuid.to_string(), text: texts[i].clone(), extra: serde_json::json!({ "name": name, "description": desc, "chunk_index": i, "total_chunks": texts.len(), }) .into(), }, }) .collect(); self.client.upsert(points).await?; } tracing::info!(skill_id = %skill_id, chunks = texts.len(), "embed_skill: complete"); Ok(()) } pub async fn embed_issue_chunked( &self, id: &str, title: &str, body: Option<&str>, ) -> crate::Result<()> { let text = match body { Some(b) if !b.is_empty() => format!("{}\n\n{}", title, b), _ => title.to_string(), }; let chunks = chunk_text(&text); if chunks.len() == 1 { return self.embed_issue(id, title, body).await; } let embeddings = self.client.embed_batch(&chunks, &self.model_name).await?; let points: Vec = embeddings .into_iter() .enumerate() .map(|(i, vector)| EmbedVector { id: format!("{}:chunk:{}", id, i), vector, payload: EmbedPayload { entity_type: "issue".to_string(), entity_id: id.to_string(), text: chunks[i].clone(), extra: serde_json::json!({ "chunk_index": i, "total_chunks": chunks.len(), }) .into(), }, }) .collect(); self.client.upsert(points).await } pub async fn embed_memories_batch(&self, messages: Vec) -> crate::Result<()> { if messages.is_empty() { return Ok(()); } let mut by_room: HashMap)>> = HashMap::new(); for msg in messages { let chunks = chunk_text(&msg.content); if chunks.is_empty() || chunks.iter().all(|c| c.trim().is_empty()) { continue; } let collection = super::qdrant::QdrantClient::room_memory_collection_name( &msg.project_name, &msg.room_id, ); by_room.entry(collection).or_default().push((msg, chunks)); } for (collection, entries) in &by_room { let all_texts: Vec = entries .iter() .flat_map(|(_, chunks)| chunks.iter().cloned()) .collect(); if all_texts.is_empty() { continue; } let embeddings = self .client .embed_batch(&all_texts, &self.model_name) .await?; if let Some((first, _)) = entries.first() { let _ = self .client .ensure_room_memory_collection( &first.project_name, &first.room_id, self.dimensions, ) .await; } let mut points = Vec::new(); let mut embed_idx = 0; for (msg, chunks) in entries { for (chunk_i, chunk) in chunks.iter().enumerate() { if embed_idx >= embeddings.len() { break; } let point_id = if chunks.len() == 1 { msg.message_id.clone() } else { format!("{}:chunk:{}", msg.message_id, chunk_i) }; points.push(EmbedVector { id: point_id, vector: embeddings[embed_idx].clone(), payload: EmbedPayload { entity_type: "memory".to_string(), entity_id: msg.room_id.clone(), text: chunk.clone(), extra: serde_json::json!({ "message_id": msg.message_id, "seq": msg.seq, "user_id": msg.user_id, "sender_type": msg.sender_type, "chunk_index": if chunks.len() > 1 { Some(chunk_i) } else { None }, "total_chunks": if chunks.len() > 1 { Some(chunks.len()) } else { None }, }) .into(), }, }); embed_idx += 1; } } if let Err(e) = self.client.upsert_to_collection(collection, points).await { tracing::warn!(collection = %collection, error = %e, "batch memory embed failed"); } } Ok(()) } pub async fn embed_tags_batch( &self, tags: Vec, ) -> crate::Result<()> { if tags.is_empty() { return Ok(()); } let texts: Vec = tags .iter() .map(|t| { if let Some(ref desc) = t.description { if !desc.is_empty() { format!("{}: {}", t.name, desc) } else { t.name.clone() } } else { t.name.clone() } }) .collect(); let embeddings = self.client.embed_batch(&texts, &self.model_name).await?; let points: Vec = tags .into_iter() .zip(embeddings.into_iter()) .map(|(tag, vector)| { let point_id = format!("{}:{}", tag.repo_id, tag.name); EmbedVector { id: point_id, vector, payload: EmbedPayload { entity_type: "repo_tag".to_string(), entity_id: tag.project_id.clone(), text: tag.name.clone(), extra: serde_json::json!({ "repo_id": tag.repo_id, "repo_name": tag.repo_name, "tag_name": tag.name, "description": tag.description, }) .into(), }, } }) .collect(); self.client.upsert(points).await } pub async fn embed_memory( &self, message_id: &str, text: &str, project_name: &str, room_id: &str, user_id: Option<&str>, ) -> crate::Result<()> { self.client .embed_memory( message_id, text, project_name, room_id, user_id, &self.model_name, ) .await } }