Python · Advanced

Semantic Chunking for RAG Documents

Split documents into semantically coherent chunks using embedding similarity for better RAG retrieval.

Python — press ⌘/Ctrl + Shift + C to copy

import re

import numpy as np
from openai import OpenAI

# Module-level OpenAI client, constructed with default arguments and used by
# embed_sentences. NOTE(review): credentials presumably come from the
# environment (e.g. OPENAI_API_KEY) — confirm against the deployment setup.
client = OpenAI()

def embed_sentences(sentences: list[str]) -> np.ndarray:
    """Embed a batch of sentences with the 'text-embedding-3-small' model.

    Args:
        sentences: Texts to embed; all of them are sent in one request.

    Returns:
        An array built from the embeddings in the response, one row per
        entry of the response's data list. For an empty input an array of
        shape ``(0, 0)`` is returned and no request is made.
    """
    if not sentences:
        # Nothing to embed: skip the network round-trip entirely.
        return np.empty((0, 0))
    resp = client.embeddings.create(input=sentences, model='text-embedding-3-small')
    return np.array([item.embedding for item in resp.data])

def _split_sentences(text: str) -> list[str]:
    """Split *text* after '.', '!' or '?' when followed by whitespace."""
    # Requiring whitespace after the punctuation keeps decimals and version
    # numbers such as "3.14" or "v2.0" in one piece. Abbreviations followed
    # by a space (e.g. "Dr. Smith") are still split; that is a known limit
    # of this lightweight splitter.
    parts = re.split(r'(?<=[.!?])\s+', text.strip())
    return [part for part in parts if part]


def semantic_chunk(text: str, threshold: float = 0.8) -> list[str]:
    """Group consecutive sentences of *text* into chunks by embedding similarity.

    The text is split into sentences, every sentence is embedded via
    ``embed_sentences``, and a new chunk is started whenever the cosine
    similarity between two adjacent sentences is below *threshold*.

    Args:
        text: The document text to chunk.
        threshold: Minimum cosine similarity for two adjacent sentences to
            stay in the same chunk.

    Returns:
        The chunks in document order. Sentences keep their own terminal
        punctuation and are joined with a single space. If fewer than two
        sentences are found, ``[text]`` is returned unchanged and nothing
        is embedded.
    """
    sentences = _split_sentences(text)
    if len(sentences) < 2:
        return [text]

    embs = embed_sentences(sentences)
    # Cosine similarity between adjacent sentences, computed in one
    # vectorized pass; each row norm is evaluated only once.
    norms = np.linalg.norm(embs, axis=1)
    sims = (embs[:-1] * embs[1:]).sum(axis=1) / (norms[:-1] * norms[1:])

    chunks: list[str] = []
    current = [sentences[0]]
    for sentence, sim in zip(sentences[1:], sims):
        if sim < threshold:
            # Similarity dropped: close the running chunk and start a new one.
            chunks.append(' '.join(current))
            current = [sentence]
        else:
            current.append(sentence)
    # `current` always holds at least one sentence at this point.
    chunks.append(' '.join(current))
    return chunks

# Demo: chunk a short text whose sentences cover different topics and print
# every chunk that semantic_chunk returns at a similarity threshold of 0.7.
text = 'Python is a programming language. It is used for data science. The weather is nice today. Machine learning is a form of AI. Neural networks learn from data.'
demo_chunks = semantic_chunk(text, threshold=0.7)
for chunk in demo_chunks:
    print('CHUNK:', chunk)