// gitdataai/libs/agent/embed/chunk.rs

/// Maximum number of characters per chunk sent for embedding (a proxy for the
/// model's token limit).
///
/// text-embedding-3-small accepts at most 8192 tokens. CJK text runs at roughly
/// 1 char/token and English at roughly 4 chars/token, so 7000 chars is a
/// conservative cap that leaves headroom for every language.
const MAX_CHUNK_CHARS: usize = 7000;

/// Finds the byte offset at which `window` should be cut, if it contains a
/// natural boundary.
///
/// Boundaries are tried in order of preference, each time taking the *last*
/// occurrence inside the window:
/// 1. a blank line (`"\n\n"`) — the cut falls after both newlines;
/// 2. a single newline — the cut falls after it;
/// 3. a sentence end (`". "`, `"! "` or `"? "`) — the cut falls after the
///    punctuation mark, so the following space starts the next chunk.
///
/// Returns `None` when the window contains none of these. A returned offset is
/// always at least 1 and always lies on a `char` boundary, because every
/// pattern is ASCII.
fn chunk_break_offset(window: &str) -> Option<usize> {
    const SENTENCE_ENDS: [&str; 3] = [". ", "! ", "? "];

    window
        .rfind("\n\n")
        .map(|pos| pos + 2)
        .or_else(|| window.rfind('\n').map(|pos| pos + 1))
        .or_else(|| {
            // All sentence terminators are equally good; pick whichever occurs
            // latest so the chunk is as full as possible.
            SENTENCE_ENDS
                .iter()
                .filter_map(|&pat| window.rfind(pat))
                .max()
                .map(|pos| pos + 1)
        })
}

/// Splits long text into chunks of at most [`MAX_CHUNK_CHARS`] characters,
/// cutting at paragraph, line or sentence boundaries where possible.
///
/// * Always returns at least one chunk; empty input yields a single empty
///   string.
/// * Chunks are contiguous slices of `text`: concatenating them reproduces the
///   input exactly (no trimming, no overlap).
/// * Limits are counted in `char`s, and every cut lies on a `char` boundary, so
///   multi-byte text is never split inside a code point.
/// * When a window contains no boundary at all, it is cut hard at the
///   character limit.
pub fn chunk_text(text: &str) -> Vec<String> {
    // Byte length is an upper bound on the char count, so this cheap check is
    // sufficient (though not necessary) for "fits in one chunk". It also
    // covers the empty-input case.
    if text.len() <= MAX_CHUNK_CHARS {
        return vec![text.to_string()];
    }

    let mut chunks = Vec::new();
    let mut rest = text;

    loop {
        // Byte offset of the first char *beyond* the window. `None` means the
        // remainder has at most MAX_CHUNK_CHARS chars and is the final chunk.
        let window_end = match rest.char_indices().nth(MAX_CHUNK_CHARS) {
            Some((idx, _)) => idx,
            None => {
                chunks.push(rest.to_string());
                break;
            }
        };

        let window = &rest[..window_end];
        // The cut is always >= 1 byte, so every iteration makes progress.
        let cut = chunk_break_offset(window).unwrap_or(window_end);

        chunks.push(rest[..cut].to_string());
        rest = &rest[cut..];
    }

    chunks
}