64 lines
2.3 KiB
Rust
64 lines
2.3 KiB
Rust
/// Maximum number of characters (Unicode scalar values) per embedding chunk.
///
/// The character count is used as a cheap stand-in for the model's token limit.
/// text-embedding-3-small accepts up to 8192 tokens; CJK text runs at roughly
/// 1 char/token and English at roughly 4 chars/token, so 7000 chars is a
/// conservative bound that leaves headroom for every language.
const MAX_CHUNK_CHARS: usize = 7000;

/// Split `text` into chunks of at most [`MAX_CHUNK_CHARS`] characters.
///
/// Each chunk ends at the best boundary found inside its window (see
/// [`find_chunk_break`]); when the window contains no boundary at all, the
/// chunk is cut hard at exactly `MAX_CHUNK_CHARS` characters.
///
/// Guarantees:
/// * at least one chunk is returned, even for empty input (a single empty
///   string);
/// * concatenating the chunks reproduces `text` exactly — nothing is trimmed
///   or dropped;
/// * cuts always land on UTF-8 character boundaries, so multi-byte text is
///   never split inside a code point.
pub fn chunk_text(text: &str) -> Vec<String> {
    // The byte length is an upper bound on the character count, so this is a
    // cheap fast path that needs no scan. It also covers the empty string.
    if text.len() <= MAX_CHUNK_CHARS {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    let mut rest = text;

    // `nth(MAX_CHUNK_CHARS)` yields the byte offset of the first character
    // *beyond* the window, i.e. the window's exclusive end. It returns `None`
    // once the remainder fits into a single chunk.
    while let Some((window_end, _)) = rest.char_indices().nth(MAX_CHUNK_CHARS) {
        let window = &rest[..window_end];
        // Without any natural boundary, fall back to a hard cut at the limit.
        let cut = find_chunk_break(window).unwrap_or(window_end);
        let (chunk, tail) = rest.split_at(cut);
        chunks.push(chunk.to_string());
        rest = tail;
    }

    // The remainder is never empty here: every cut leaves at least the
    // character that sat just past the window.
    chunks.push(rest.to_string());
    chunks
}

/// Find the byte offset at which `window` should be cut, if it contains a
/// natural boundary.
///
/// Preference order:
/// 1. after the last blank line (`"\n\n"`);
/// 2. after the last newline;
/// 3. after the last sentence terminator (`.`, `!` or `?` followed by a
///    space). The three terminators rank equally, so the one closest to the
///    end of the window wins; the following space stays with the next chunk.
///
/// The returned offset is always at least 1 and always directly follows an
/// ASCII byte, so it is a valid char boundary and guarantees forward progress.
fn find_chunk_break(window: &str) -> Option<usize> {
    window
        .rfind("\n\n")
        .map(|pos| pos + 2)
        .or_else(|| window.rfind('\n').map(|pos| pos + 1))
        .or_else(|| {
            // Take the *latest* terminator rather than favouring ". ":
            // an early ". " must not shadow a much later "! " or "? ",
            // which would produce a needlessly tiny chunk.
            [". ", "! ", "? "]
                .iter()
                .filter_map(|sep| window.rfind(*sep))
                .max()
                .map(|pos| pos + 1)
        })
}
|