gitdataai/libs/agent/embed/chunk.rs

64 lines
2.3 KiB
Rust

/// Maximum number of characters per chunk submitted for embedding.
///
/// This is a character count used as a rough stand-in for the model's token
/// limit; the ratios below are estimates, not tokenizer output.
///
/// - text-embedding-3-small: 8192 token limit.
/// - CJK text: ~1 char/token; English text: ~4 chars/token.
///
/// 7000 is a deliberately conservative value chosen to leave headroom below
/// that limit for all languages.
const MAX_CHUNK_CHARS: usize = 7000;
/// Splits `text` into chunks of at most `MAX_CHUNK_CHARS` characters each,
/// preferring to cut at natural boundaries.
///
/// Within each window of `MAX_CHUNK_CHARS` characters the cut point is chosen
/// with this priority:
///
/// 1. just after the last blank line (`"\n\n"`),
/// 2. just after the last line break (`'\n'`),
/// 3. just after the last sentence terminator that is followed by a space
///    (`". "`, `"! "` or `"? "`, whichever occurs latest),
/// 4. otherwise a hard cut at the window end.
///
/// No characters are added or dropped: concatenating the returned chunks
/// yields `text` again. Limits are counted in characters, and every cut lands
/// on a `char` boundary, so multi-byte text is never split mid-character.
///
/// # Returns
///
/// At least one chunk; empty input yields a single empty string.
pub fn chunk_text(text: &str) -> Vec<String> {
    // A string's byte length is never smaller than its character count, so
    // this O(1) check safely short-circuits short inputs (including "").
    if text.len() <= MAX_CHUNK_CHARS {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    let mut rest = text;
    loop {
        // Byte offset of the first character beyond the window. `None` means
        // the remainder already fits into a single chunk.
        let window_end = match rest.char_indices().nth(MAX_CHUNK_CHARS) {
            Some((offset, _)) => offset,
            None => {
                chunks.push(rest.to_string());
                break;
            }
        };

        // The break offset is always >= 1 and <= window_end < rest.len(), so
        // each iteration consumes input and leaves a non-empty remainder.
        let cut = find_chunk_break(&rest[..window_end]).unwrap_or(window_end);
        let (head, tail) = rest.split_at(cut);
        chunks.push(head.to_string());
        rest = tail;
    }
    chunks
}

/// Returns the byte offset just past the preferred break point in `window`,
/// or `None` when it contains no paragraph, line or sentence boundary.
fn find_chunk_break(window: &str) -> Option<usize> {
    window
        .rfind("\n\n")
        .map(|pos| pos + 2)
        .or_else(|| window.rfind('\n').map(|pos| pos + 1))
        .or_else(|| {
            // Use whichever sentence terminator occurs latest, so an early
            // ". " cannot pre-empt a much later "! " or "? ". The cut goes
            // after the punctuation; the space starts the next chunk.
            [". ", "! ", "? "]
                .iter()
                .filter_map(|mark| window.rfind(*mark))
                .max()
                .map(|pos| pos + 1)
        })
}