TypeScript · Intermediate

Text Chunking Strategies for RAG

Implement different text chunking strategies for RAG pipelines — fixed-size, recursive, and paragraph-based.

typescript
/** One piece of a source text, as produced by the chunking functions in this file. */
interface TextChunk {
  /** The chunk's text content. */
  text: string;
  /** Zero-based position of the chunk within the array returned by the chunker. */
  index: number;
  /** Extra details about the chunk; which keys are present depends on the producing function. */
  metadata: Record<string, unknown>;
}

/**
 * Splits `text` into sliding windows of at most `chunkSize` characters, where
 * each window starts `chunkSize - overlap` characters after the previous one.
 *
 * Sizes and offsets are measured in UTF-16 code units (the units used by
 * `String.prototype.slice`), so a window boundary may fall inside a surrogate pair.
 *
 * @param text - The string to split. An empty string yields no chunks.
 * @param chunkSize - Maximum length of a chunk; must be greater than 0.
 * @param overlap - Number of characters shared by consecutive chunks; must
 *   satisfy `0 <= overlap < chunkSize` so the window always moves forward.
 * @returns The chunks in order, each with `start`/`end` offsets into `text`
 *   and `strategy: 'fixed'` in its metadata.
 * @throws RangeError if `chunkSize` or `overlap` is outside the ranges above.
 */
function fixedSizeChunk(text: string, chunkSize = 500, overlap = 50): TextChunk[] {
  // Written as negated comparisons so that NaN is rejected as well.
  if (!(chunkSize > 0)) {
    throw new RangeError(`chunkSize must be greater than 0, got ${chunkSize}`);
  }
  if (!(overlap >= 0 && overlap < chunkSize)) {
    // A step of zero or less would never advance `start` and loop forever.
    throw new RangeError(
      `overlap must be >= 0 and smaller than chunkSize (${chunkSize}), got ${overlap}`
    );
  }

  const step = chunkSize - overlap;
  const chunks: TextChunk[] = [];
  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(start + chunkSize, text.length);
    chunks.push({
      text: text.slice(start, end),
      index: chunks.length,
      metadata: { start, end, strategy: 'fixed' },
    });
    // Once a window reaches the end of the text, any further window would be
    // fully contained in this one, so stop instead of emitting a duplicate tail.
    if (end === text.length) break;
  }
  return chunks;
}

/**
 * Splits `text` into chunks of at most `maxSize` characters by trying each
 * separator in order: the text is split on the first separator and the parts
 * are greedily merged back together while they fit; any merged piece that is
 * still too long is split again with the remaining separators.
 *
 * When a piece is still longer than `maxSize` after every separator has been
 * tried, it is cut into consecutive `maxSize`-sized slices so no text is lost.
 *
 * Notes:
 * - The separator at a split boundary is not included in either neighbouring chunk.
 * - Each chunk is trimmed, and chunks that are empty after trimming are dropped.
 *
 * @param text - The string to split.
 * @param maxSize - Maximum chunk length in UTF-16 code units; must be at least 1.
 * @param separators - Separators to try, from coarsest to finest.
 * @returns The chunks in order, each tagged with `strategy: 'recursive'`.
 * @throws RangeError if `maxSize` is less than 1 (or NaN).
 */
function recursiveChunk(
  text: string,
  maxSize = 500,
  separators = ['\n\n', '\n', '. ', ' ']
): TextChunk[] {
  if (!(maxSize >= 1)) {
    throw new RangeError(`maxSize must be at least 1, got ${maxSize}`);
  }
  // Whole-number stride for the hard-split fallback (slice truncates fractions anyway).
  const stride = Math.floor(maxSize);

  /** Cuts `piece` into consecutive slices of at most `stride` characters. */
  function hardSplit(piece: string): string[] {
    const slices: string[] = [];
    for (let offset = 0; offset < piece.length; offset += stride) {
      slices.push(piece.slice(offset, offset + stride));
    }
    return slices;
  }

  /** Recursively splits `piece` using `seps`, coarsest separator first. */
  function split(piece: string, seps: string[]): string[] {
    if (piece.length <= maxSize) return [piece];
    const [sep, ...rest] = seps;
    // No separators left: fall back to fixed-size slices instead of truncating.
    if (sep === undefined) return hardSplit(piece);

    const results: string[] = [];
    let current = '';
    for (const part of piece.split(sep)) {
      if (current && (current + sep + part).length > maxSize) {
        // `current` is as full as it can get; it may still be oversized if it
        // is a single unsplittable part, so hand it to the finer separators.
        results.push(...split(current, rest));
        current = part;
      } else {
        current = current ? current + sep + part : part;
      }
    }
    if (current) results.push(...split(current, rest));
    return results;
  }

  const chunks: TextChunk[] = [];
  for (const piece of split(text, separators)) {
    const trimmed = piece.trim();
    if (trimmed) {
      chunks.push({ text: trimmed, index: chunks.length, metadata: { strategy: 'recursive' } });
    }
  }
  return chunks;
}

/**
 * Groups paragraphs (runs of text separated by two or more newlines) into
 * chunks, packing as many consecutive paragraphs into each chunk as fit
 * within `maxSize` characters when joined by a blank line.
 *
 * A single paragraph longer than `maxSize` is never split; it is emitted as
 * its own oversized chunk. Each chunk is trimmed, and chunks that are empty
 * after trimming are skipped so no blank chunk is ever returned.
 *
 * @param text - The string to split.
 * @param maxSize - Target maximum chunk length in UTF-16 code units.
 * @returns The chunks in order, each tagged with `strategy: 'paragraph'`.
 */
function paragraphChunk(text: string, maxSize = 1000): TextChunk[] {
  const chunks: TextChunk[] = [];

  // Single place that emits a chunk, so the "skip empty" rule applies both to
  // flushes inside the loop and to the final flush.
  const flush = (buffer: string): void => {
    const trimmed = buffer.trim();
    if (trimmed) {
      chunks.push({ text: trimmed, index: chunks.length, metadata: { strategy: 'paragraph' } });
    }
  };

  let current = '';
  for (const para of text.split(/\n\n+/)) {
    const candidate = current ? current + '\n\n' + para : para;
    if (current && candidate.length > maxSize) {
      // Adding this paragraph would overflow: close the chunk and start a new one.
      flush(current);
      current = para;
    } else {
      current = candidate;
    }
  }
  flush(current);
  return chunks;
}

Use Cases

  • RAG pipeline preprocessing
  • document indexing
  • embedding preparation

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.