/// Maximum characters per chunk for embedding (approximates the token limit).
///
/// text-embedding-3-small has an 8192 token limit. CJK text runs at roughly
/// 1 char/token and English at roughly 4 chars/token, so 7000 chars is a
/// conservative cap that leaves headroom for all languages.
const MAX_CHUNK_CHARS: usize = 7000;

/// Finds the byte offset inside `window` at which a chunk should end.
///
/// Boundaries are tried in priority order: the last blank line (`"\n\n"`),
/// then the last newline, then the last sentence terminator (`.`, `!` or `?`)
/// that is followed by a space. The returned offset is exclusive and always
/// greater than zero; newlines stay with the chunk they end, while the space
/// after a sentence terminator is left for the next chunk.
///
/// Returns `None` when `window` contains none of these boundaries.
fn find_break_offset(window: &str) -> Option<usize> {
    if let Some(pos) = window.rfind("\n\n") {
        return Some(pos + 2);
    }
    if let Some(pos) = window.rfind('\n') {
        return Some(pos + 1);
    }
    // All three terminators are equally good boundaries, so take whichever
    // occurs last; preferring ". " unconditionally would cut chunks short
    // whenever a later "! " or "? " is available.
    [". ", "! ", "? "]
        .iter()
        .filter_map(|terminator| window.rfind(*terminator))
        .max()
        .map(|pos| pos + 1)
}

/// Splits long text into chunks of at most [`MAX_CHUNK_CHARS`] characters,
/// preferring paragraph, line, and sentence boundaries (in that order).
///
/// Always returns at least one chunk: empty input yields a single empty
/// string. Concatenating the returned chunks reproduces `text` exactly.
/// Limits are counted in `char`s and every split lands on a UTF-8 character
/// boundary, so multi-byte text is handled safely. When a window contains no
/// recognised boundary, the text is cut hard at the character limit.
pub fn chunk_text(text: &str) -> Vec<String> {
    // Fast path: the byte length is an upper bound on the char count, so a
    // text this short (including the empty string) always fits in one chunk.
    if text.len() <= MAX_CHUNK_CHARS {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    let mut rest = text;

    loop {
        // Byte offset of the first char *beyond* the window. `None` means the
        // remainder holds at most MAX_CHUNK_CHARS chars and is the last chunk.
        let window_end = match rest.char_indices().nth(MAX_CHUNK_CHARS) {
            Some((offset, _)) => offset,
            None => {
                chunks.push(rest.to_string());
                break;
            }
        };

        // Fall back to a hard cut at the window edge when no boundary exists.
        let split_at = find_break_offset(&rest[..window_end]).unwrap_or(window_end);

        // `split_at` is > 0 and <= window_end < rest.len(), so each iteration
        // makes progress and the tail is never empty.
        let (head, tail) = rest.split_at(split_at);
        chunks.push(head.to_string());
        rest = tail;
    }

    chunks
}