gitdataai/libs/agent/embed/entity_embed.rs
ZhenYi d45e9e28f4 refactor(agent): split monolithic service files into specialized modules
Extract agent, compact, embed, task, and modes modules from single
service.rs files into focused sub-modules. Add orao module for
O1-like reasoning loop. Move RigAgentService to rig_tool.rs.
2026-05-11 17:04:57 +08:00

351 lines
12 KiB
Rust

use std::collections::HashMap;
use super::chunk::chunk_text;
use super::client::{EmbedPayload, EmbedVector};
use super::embeddable::{EmbedMemoryInput, Embeddable};
/// Embedding and upsert operations for entity vectors in Qdrant.
impl super::EmbedService {
/// Embeds a single issue and upserts it as one point.
///
/// The embedded text is `title`, followed by a blank line and `body` when a
/// non-empty body is supplied. The point id and the payload `entity_id` are
/// both set to `id`, and the payload `entity_type` is `"issue"`.
///
/// # Errors
///
/// Propagates any error returned by the client's `embed_text` or `upsert`.
pub async fn embed_issue(
    &self,
    id: &str,
    title: &str,
    body: Option<&str>,
) -> crate::Result<()> {
    // An absent or empty body means the title alone is embedded.
    let document = body
        .filter(|b| !b.is_empty())
        .map_or_else(|| title.to_string(), |b| format!("{}\n\n{}", title, b));

    tracing::debug!(issue_id = %id, text_len = document.len(), "embed_issue: calling embedding API");
    let embedding = self.client.embed_text(&document, &self.model_name).await?;
    tracing::debug!(issue_id = %id, vec_dim = embedding.len(), "embed_issue: embedding done");

    let payload = EmbedPayload {
        entity_type: "issue".to_string(),
        entity_id: id.to_string(),
        text: document,
        extra: None,
    };
    let points = vec![EmbedVector {
        id: id.to_string(),
        vector: embedding,
        payload,
    }];
    self.client.upsert(points).await?;

    tracing::info!(issue_id = %id, "embed_issue: upsert complete");
    Ok(())
}
/// Embeds a single repository and upserts it as one point.
///
/// The embedded text is `"<name>: <description>"` when a non-empty
/// description is supplied, otherwise just `name`. The point id and the
/// payload `entity_id` are both set to `id`, and the payload `entity_type`
/// is `"repo"`.
///
/// # Errors
///
/// Propagates any error returned by the client's `embed_text` or `upsert`.
pub async fn embed_repo(
    &self,
    id: &str,
    name: &str,
    description: Option<&str>,
) -> crate::Result<()> {
    // Treat an empty description the same as a missing one.
    let summary = match description.filter(|d| !d.is_empty()) {
        Some(d) => format!("{}: {}", name, d),
        None => name.to_string(),
    };

    tracing::debug!(repo_id = %id, text_len = summary.len(), "embed_repo: calling embedding API");
    let embedding = self.client.embed_text(&summary, &self.model_name).await?;
    tracing::debug!(repo_id = %id, vec_dim = embedding.len(), "embed_repo: embedding done");

    let payload = EmbedPayload {
        entity_type: "repo".to_string(),
        entity_id: id.to_string(),
        text: summary,
        extra: None,
    };
    let points = vec![EmbedVector {
        id: id.to_string(),
        vector: embedding,
        payload,
    }];
    self.client.upsert(points).await?;

    tracing::info!(repo_id = %id, "embed_repo: upsert complete");
    Ok(())
}
/// Embeds a batch of [`Embeddable`] items with one `embed_batch` call and
/// upserts the resulting points together.
///
/// Each point uses the item's `entity_id()` as both point id and payload
/// `entity_id`, the item's `entity_type()` as payload `entity_type`, and the
/// item's `to_text()` output as payload text. An empty `items` is a no-op.
///
/// Items and embeddings are paired positionally. If the embedding call
/// returns a different number of vectors than items, a warning is logged and
/// only the pairs that line up are upserted.
///
/// # Errors
///
/// Propagates any error returned by the client's `embed_batch` or `upsert`.
pub async fn embed_issues<T: Embeddable + Send + Sync>(
    &self,
    items: Vec<T>,
) -> crate::Result<()> {
    if items.is_empty() {
        return Ok(());
    }

    let texts: Vec<String> = items.iter().map(|i| i.to_text()).collect();
    tracing::debug!(count = texts.len(), "embed_issues: calling embed_batch");
    let embeddings = self.client.embed_batch(&texts, &self.model_name).await?;
    tracing::debug!(count = embeddings.len(), "embed_issues: batch done");

    // `zip` stops at the shorter side, so make a short or long response visible
    // instead of silently dropping items.
    if embeddings.len() != items.len() {
        tracing::warn!(
            expected = items.len(),
            received = embeddings.len(),
            "embed_issues: embedding count mismatch, unmatched items are skipped"
        );
    }

    // Reuse the texts that were actually embedded rather than calling
    // `to_text()` a second time per item.
    let points: Vec<EmbedVector> = items
        .into_iter()
        .zip(texts)
        .zip(embeddings)
        .map(|((item, text), vector)| {
            let entity_id = item.entity_id();
            EmbedVector {
                id: entity_id.clone(),
                vector,
                payload: EmbedPayload {
                    entity_type: item.entity_type().to_string(),
                    entity_id,
                    text,
                    extra: None,
                },
            }
        })
        .collect();

    let count = points.len();
    self.client.upsert(points).await?;
    tracing::info!(count = count, "embed_issues: upsert complete");
    Ok(())
}
/// Embeds a skill, splitting `content` into chunks via `chunk_text`.
///
/// * Exactly one chunk: delegates to the client's `embed_skill` with the
///   whole `content`.
/// * Otherwise: each chunk is embedded as `"<name>: <description> <chunk>"`
///   in a single `embed_batch` call and upserted as a point with id
///   `"<skill_id>:chunk:<index>"`. The payload text is the raw chunk, and
///   `extra` carries `name`, `description`, `chunk_index` and `total_chunks`.
///
/// A missing `description` is treated as the empty string.
///
/// NOTE(review): in the chunked path the payload `entity_id` is
/// `project_uuid`, not the skill id — confirm this matches what the client's
/// single-chunk `embed_skill` stores.
///
/// # Errors
///
/// Propagates any error returned by the client's `embed_skill`,
/// `embed_batch` or `upsert`.
pub async fn embed_skill(
    &self,
    skill_id: i64,
    name: &str,
    description: Option<&str>,
    content: &str,
    project_uuid: &str,
) -> crate::Result<()> {
    let desc = description.unwrap_or_default();
    let id = skill_id.to_string();
    tracing::debug!(skill_id = %skill_id, name = %name, content_len = content.len(), "embed_skill: starting");
    let texts = chunk_text(content);
    tracing::debug!(skill_id = %skill_id, chunks = texts.len(), "embed_skill: chunked");

    if texts.len() == 1 {
        self.client
            .embed_skill(&id, name, desc, content, project_uuid, &self.model_name)
            .await?;
    } else {
        let full_texts: Vec<String> = texts
            .iter()
            .map(|t| format!("{}: {} {}", name, desc, t))
            .collect();
        tracing::debug!(skill_id = %skill_id, "embed_skill: calling embed_batch");
        let embeddings = self.client.embed_batch(&full_texts, &self.model_name).await?;

        let total_chunks = texts.len();
        if embeddings.len() != total_chunks {
            tracing::warn!(
                skill_id = %skill_id,
                expected = total_chunks,
                received = embeddings.len(),
                "embed_skill: embedding count mismatch, unmatched chunks are skipped"
            );
        }

        // Pair chunks with vectors by zipping instead of indexing `texts[i]`,
        // so a response with more vectors than chunks cannot panic.
        let points: Vec<EmbedVector> = texts
            .iter()
            .zip(embeddings)
            .enumerate()
            .map(|(i, (chunk, vector))| EmbedVector {
                id: format!("{}:chunk:{}", id, i),
                vector,
                payload: EmbedPayload {
                    entity_type: "skill".to_string(),
                    entity_id: project_uuid.to_string(),
                    text: chunk.clone(),
                    extra: serde_json::json!({
                        "name": name,
                        "description": desc,
                        "chunk_index": i,
                        "total_chunks": total_chunks,
                    })
                    .into(),
                },
            })
            .collect();
        self.client.upsert(points).await?;
    }

    tracing::info!(skill_id = %skill_id, chunks = texts.len(), "embed_skill: complete");
    Ok(())
}
pub async fn embed_issue_chunked(
&self,
id: &str,
title: &str,
body: Option<&str>,
) -> crate::Result<()> {
let text = match body {
Some(b) if !b.is_empty() => format!("{}\n\n{}", title, b),
_ => title.to_string(),
};
let chunks = chunk_text(&text);
if chunks.len() == 1 {
return self.embed_issue(id, title, body).await;
}
let embeddings = self.client.embed_batch(&chunks, &self.model_name).await?;
let points: Vec<EmbedVector> = embeddings
.into_iter()
.enumerate()
.map(|(i, vector)| EmbedVector {
id: format!("{}:chunk:{}", id, i),
vector,
payload: EmbedPayload {
entity_type: "issue".to_string(),
entity_id: id.to_string(),
text: chunks[i].clone(),
extra: serde_json::json!({
"chunk_index": i,
"total_chunks": chunks.len(),
})
.into(),
},
})
.collect();
self.client.upsert(points).await
}
pub async fn embed_memories_batch(
&self,
messages: Vec<EmbedMemoryInput>,
) -> crate::Result<()> {
if messages.is_empty() {
return Ok(());
}
let mut by_room: HashMap<String, Vec<(EmbedMemoryInput, Vec<String>)>> = HashMap::new();
for msg in messages {
let chunks = chunk_text(&msg.content);
if chunks.is_empty() || chunks.iter().all(|c| c.trim().is_empty()) {
continue;
}
let collection = super::qdrant::QdrantClient::room_memory_collection_name(
&msg.project_name, &msg.room_id,
);
by_room.entry(collection).or_default().push((msg, chunks));
}
for (collection, entries) in &by_room {
let all_texts: Vec<String> = entries
.iter()
.flat_map(|(_, chunks)| chunks.iter().cloned())
.collect();
if all_texts.is_empty() {
continue;
}
let embeddings = self.client.embed_batch(&all_texts, &self.model_name).await?;
if let Some((first, _)) = entries.first() {
let _ = self.client
.ensure_room_memory_collection(&first.project_name, &first.room_id, self.dimensions)
.await;
}
let mut points = Vec::new();
let mut embed_idx = 0;
for (msg, chunks) in entries {
for (chunk_i, chunk) in chunks.iter().enumerate() {
if embed_idx >= embeddings.len() {
break;
}
let point_id = if chunks.len() == 1 {
msg.message_id.clone()
} else {
format!("{}:chunk:{}", msg.message_id, chunk_i)
};
points.push(EmbedVector {
id: point_id,
vector: embeddings[embed_idx].clone(),
payload: EmbedPayload {
entity_type: "memory".to_string(),
entity_id: msg.room_id.clone(),
text: chunk.clone(),
extra: serde_json::json!({
"user_id": msg.user_id,
"sender_type": msg.sender_type,
"chunk_index": if chunks.len() > 1 {
Some(chunk_i)
} else {
None
},
"total_chunks": if chunks.len() > 1 {
Some(chunks.len())
} else {
None
},
})
.into(),
},
});
embed_idx += 1;
}
}
if let Err(e) = self.client.upsert_to_collection(collection, points).await {
tracing::warn!(collection = %collection, error = %e, "batch memory embed failed");
}
}
Ok(())
}
pub async fn embed_tags_batch(
&self,
tags: Vec<super::embeddable::TagEmbedInput>,
) -> crate::Result<()> {
if tags.is_empty() {
return Ok(());
}
let texts: Vec<String> = tags
.iter()
.map(|t| {
if let Some(ref desc) = t.description {
if !desc.is_empty() {
format!("{}: {}", t.name, desc)
} else {
t.name.clone()
}
} else {
t.name.clone()
}
})
.collect();
let embeddings = self.client.embed_batch(&texts, &self.model_name).await?;
let points: Vec<EmbedVector> = tags
.into_iter()
.zip(embeddings.into_iter())
.map(|(tag, vector)| {
let point_id = format!("{}:{}", tag.repo_id, tag.name);
EmbedVector {
id: point_id,
vector,
payload: EmbedPayload {
entity_type: "repo_tag".to_string(),
entity_id: tag.project_id.clone(),
text: tag.name.clone(),
extra: serde_json::json!({
"repo_id": tag.repo_id,
"repo_name": tag.repo_name,
"tag_name": tag.name,
"description": tag.description,
})
.into(),
},
}
})
.collect();
self.client.upsert(points).await
}
/// Embeds a single memory message by forwarding to the client's
/// `embed_memory`, using this service's configured model name.
///
/// All arguments are passed through unchanged.
///
/// # Errors
///
/// Returns whatever error the client's `embed_memory` produces.
pub async fn embed_memory(
    &self,
    message_id: &str,
    text: &str,
    project_name: &str,
    room_id: &str,
    user_id: Option<&str>,
) -> crate::Result<()> {
    let model = &self.model_name;
    let client = &self.client;
    client
        .embed_memory(message_id, text, project_name, room_id, user_id, model)
        .await
}
}