python · intermediate

Batch Embedding Large Text Corpora

Embed thousands of documents efficiently by batching requests to the OpenAI Embeddings API.

python
from openai import OpenAI
import numpy as np
from typing import Iterator

# Module-level API client shared by the functions below. It is constructed
# with no explicit arguments, so credentials and endpoint come from the
# library's defaults (presumably the OPENAI_API_KEY environment variable —
# confirm against the openai package documentation).
client = OpenAI()

def batch_iter(items: list, size: int) -> Iterator:
    """Yield consecutive slices of ``items`` holding at most ``size`` elements.

    Args:
        items: The list to split. It is only sliced, never mutated.
        size: Maximum number of elements per batch. Must be at least 1.

    Yields:
        Slices of ``items`` in their original order. The final slice may be
        shorter than ``size``; an empty ``items`` yields nothing.

    Raises:
        ValueError: If ``size`` is less than 1. Because this is a generator
            function, the error surfaces when iteration starts.
    """
    # A negative step makes range() empty, which would silently drop every
    # item; a zero step fails with an unrelated-looking range() error.
    if size < 1:
        raise ValueError(f'size must be a positive integer, got {size!r}')
    for start in range(0, len(items), size):
        yield items[start:start + size]

def embed_corpus(texts: list[str], model: str = 'text-embedding-3-small', batch_size: int = 100) -> np.ndarray:
    """Embed every string in ``texts`` and stack the vectors into one array.

    The texts are sent to the embeddings endpoint of the module-level
    ``client`` in groups of at most ``batch_size``, one request per group.

    Args:
        texts: Documents to embed, in the order the rows should appear.
        model: Name of the embedding model passed through to the API.
        batch_size: Maximum number of texts per request. Must be at least 1.

    Returns:
        A ``float32`` array with one row per input text. When ``texts`` is
        empty no request is made and the result is an empty 1-D array.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    # Fail loudly up front: a negative batch size would otherwise produce no
    # batches at all and return an empty array without any error.
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    all_embeddings = []
    for batch in batch_iter(texts, batch_size):
        resp = client.embeddings.create(input=batch, model=model)
        # Each returned item records the position of its input within the
        # request; ordering by it keeps rows aligned with ``texts`` instead
        # of relying on the order of the response list.
        ordered = sorted(resp.data, key=lambda item: item.index)
        all_embeddings.extend(item.embedding for item in ordered)
    return np.array(all_embeddings, dtype='float32')

# Demo: build 500 synthetic documents whose topic id cycles through 0-9,
# embed them with the default model and batch size, and report the shape.
corpus = [
    f'Document number {doc_id} about topic {doc_id % 10}'
    for doc_id in range(500)
]
embeddings = embed_corpus(corpus)
# Expected output with the default model: (500, 1536)
print(f'Shape: {embeddings.shape}')

Use Cases

  • corpus embedding
  • semantic indexing
  • large-scale vectorization

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.