TypeScript · Intermediate
Text Chunking Strategies for RAG
Implement different text chunking strategies for RAG pipelines — fixed-size, recursive, and paragraph-based.
typescriptPress ⌘/Ctrl + Shift + C to copy
/**
 * A single piece of text produced by one of the chunking functions in this file.
 */
interface TextChunk {
/** The chunk's text content. */
text: string;
/** Zero-based position of this chunk in the returned array. */
index: number;
/** Free-form, strategy-specific details about how the chunk was produced. */
metadata: Record<string, unknown>;
}
/**
 * Splits `text` into windows of at most `chunkSize` characters, where each
 * window starts `chunkSize - overlap` characters after the previous one.
 *
 * @param text - The string to split.
 * @param chunkSize - Maximum length of each chunk (in UTF-16 code units); must be greater than 0.
 * @param overlap - How many characters consecutive chunks share; must be
 *   smaller than `chunkSize`. A negative value makes the step larger than the
 *   window, so the characters in between are skipped.
 * @returns The chunks in order. Each chunk's metadata holds its `start` and
 *   `end` (exclusive) offsets into `text` and `strategy: 'fixed'`. Empty input
 *   yields an empty array.
 * @throws RangeError if `chunkSize` is not positive or `overlap` is not
 *   smaller than `chunkSize` (the window would never advance).
 */
function fixedSizeChunk(text: string, chunkSize = 500, overlap = 50): TextChunk[] {
  // Written as negated comparisons so NaN is rejected as well.
  if (!(chunkSize > 0) || !(overlap < chunkSize)) {
    throw new RangeError(
      `fixedSizeChunk: chunkSize must be > 0 and overlap must be < chunkSize ` +
        `(got chunkSize=${chunkSize}, overlap=${overlap})`
    );
  }

  const step = chunkSize - overlap;
  const chunks: TextChunk[] = [];

  for (let start = 0; start < text.length; start += step) {
    const end = Math.min(start + chunkSize, text.length);
    chunks.push({
      text: text.slice(start, end),
      index: chunks.length,
      metadata: { start, end, strategy: 'fixed' },
    });
    // Once a window reaches the end of the text, any further window would be
    // a pure suffix of this one, so stop instead of emitting a duplicate.
    if (end === text.length) break;
  }

  return chunks;
}
/**
 * Splits `text` into chunks of at most `maxSize` characters by trying a list
 * of separators from coarsest to finest.
 *
 * Parts produced by the current separator are greedily merged back together
 * while the merged text still fits in `maxSize`. A merged group that is still
 * too long is split again with the remaining separators. When no separators
 * are left, the text is cut into `maxSize`-sized pieces so nothing is dropped.
 *
 * Note that the separator falling on a chunk boundary is not kept in either
 * neighbouring chunk, and every chunk is trimmed; whitespace-only chunks are
 * omitted.
 *
 * @param text - The string to split.
 * @param maxSize - Maximum chunk length (in UTF-16 code units); must be greater than 0.
 * @param separators - Separators to try, in order of preference.
 * @returns The non-empty, trimmed chunks in order, each with
 *   `metadata.strategy === 'recursive'`.
 * @throws RangeError if `maxSize` is not positive.
 */
function recursiveChunk(
  text: string,
  maxSize = 500,
  separators = ['\n\n', '\n', '. ', ' ']
): TextChunk[] {
  // Written as a negated comparison so NaN is rejected as well.
  if (!(maxSize > 0)) {
    throw new RangeError(`recursiveChunk: maxSize must be > 0 (got ${maxSize})`);
  }

  // Last resort when no separator is left: cut at fixed offsets so that no
  // text is lost. The stride is floored and clamped to 1 so it always advances.
  function hardSplit(segment: string): string[] {
    const stride = Math.max(1, Math.floor(maxSize));
    const pieces: string[] = [];
    for (let pos = 0; pos < segment.length; pos += stride) {
      pieces.push(segment.slice(pos, pos + stride));
    }
    return pieces;
  }

  function split(segment: string, seps: string[]): string[] {
    if (segment.length <= maxSize) return [segment];
    if (seps.length === 0) return hardSplit(segment);

    const [sep, ...rest] = seps;
    const results: string[] = [];
    let current = '';

    for (const part of segment.split(sep)) {
      if (current && (current + sep + part).length > maxSize) {
        // Adding this part would overflow: emit what has been gathered so far
        // (re-splitting it with finer separators if it is itself too long).
        results.push(...split(current, rest));
        current = part;
      } else {
        current = current ? current + sep + part : part;
      }
    }
    if (current) results.push(...split(current, rest));

    return results;
  }

  const chunks: TextChunk[] = [];
  for (const piece of split(text, separators)) {
    const trimmed = piece.trim();
    if (trimmed) {
      chunks.push({ text: trimmed, index: chunks.length, metadata: { strategy: 'recursive' } });
    }
  }
  return chunks;
}
/**
 * Groups paragraphs (runs of text separated by two or more newlines) into
 * chunks, merging neighbouring paragraphs with a blank line between them as
 * long as the merged text does not exceed `maxSize` characters.
 *
 * Paragraphs are never split: a single paragraph longer than `maxSize` is
 * emitted as one oversized chunk.
 *
 * @param text - The string to split.
 * @param maxSize - Target maximum chunk length (in UTF-16 code units).
 * @returns The non-empty, trimmed chunks in order, each with
 *   `metadata.strategy === 'paragraph'`.
 */
function paragraphChunk(text: string, maxSize = 1000): TextChunk[] {
  const chunks: TextChunk[] = [];
  let current = '';

  // Emits the gathered text as a chunk, skipping whitespace-only content so
  // that no empty chunks are produced.
  const flush = (): void => {
    const trimmed = current.trim();
    if (trimmed) {
      chunks.push({ text: trimmed, index: chunks.length, metadata: { strategy: 'paragraph' } });
    }
  };

  for (const para of text.split(/\n\n+/)) {
    if (current && (current + '\n\n' + para).length > maxSize) {
      flush();
      current = para;
    } else {
      current = current ? current + '\n\n' + para : para;
    }
  }
  flush();

  return chunks;
}
// Use Cases
- RAG pipeline preprocessing
- document indexing
- embedding preparation
Tags
Related Snippets
Similar patterns you can reuse in the same workflow.
TypeScript · Advanced
RAG Pipeline Implementation
Build a retrieval-augmented generation pipeline that grounds LLM answers in your own documents.
Best for: Grounding LLM answers in private documents
#ai #rag
Python · Advanced
Build a RAG Pipeline with LangChain
Implement retrieval-augmented generation using LangChain, embeddings, and a vector store.
Best for: Knowledge base Q&A
#ai #langchain
Python · Intermediate
ChromaDB Vector Database Operations
Store and query vector embeddings using ChromaDB for semantic search and RAG applications.
Best for: semantic search
#ai #chromadb
Python · Advanced
Semantic Chunking for RAG Documents
Split documents into semantically coherent chunks using embedding similarity for better RAG retrieval.
Best for: RAG optimization
#chunking #rag