python · intermediate

Batch Process Embeddings Efficiently

Generate embeddings for large text datasets with batching, on-disk caching, and rate limiting.

python
import time
import json
import hashlib
from pathlib import Path
from openai import OpenAI

# Shared API client; batch_embed() sends its embedding requests through it.
client = OpenAI()

class EmbeddingCache:
    """JSON-file-backed cache mapping a hash of each text to its embedding.

    Entries are held in memory in ``self.cache`` and only reach disk when
    :meth:`save` is called.
    """

    def __init__(self, cache_file: str = "embeddings_cache.json"):
        """Load the cache from *cache_file*, starting empty if it is unusable.

        Args:
            cache_file: Path of the JSON file used to persist the cache.
        """
        self.cache_file = Path(cache_file)
        self.cache: dict[str, list] = self._load()

    def _load(self) -> dict:
        """Return the cached mapping stored on disk, or ``{}`` if unavailable."""
        try:
            loaded = json.loads(self.cache_file.read_text(encoding="utf-8"))
        except (FileNotFoundError, ValueError):
            # Missing file, malformed JSON, or undecodable bytes: a cache is
            # disposable, so rebuild it instead of failing every later run.
            return {}
        # Valid JSON that is not an object (e.g. a list) cannot act as a cache.
        return loaded if isinstance(loaded, dict) else {}

    def _key(self, text: str) -> str:
        """Return the cache key for *text*: the first 16 hex chars of its SHA-256.

        The truncation is kept so that existing cache files remain valid.
        """
        return hashlib.sha256(text.encode("utf-8")).hexdigest()[:16]

    def get(self, text: str):
        """Return the embedding stored for *text*, or ``None`` if absent."""
        return self.cache.get(self._key(text))

    def set(self, text: str, embedding: list):
        """Store *embedding* for *text* in memory (call :meth:`save` to persist)."""
        self.cache[self._key(text)] = embedding

    def save(self):
        """Write the cache to ``self.cache_file``.

        The data is written to a sibling temporary file first and then moved
        into place, so an interrupted write cannot leave a truncated cache.
        """
        tmp_file = self.cache_file.with_name(self.cache_file.name + ".tmp")
        tmp_file.write_text(json.dumps(self.cache), encoding="utf-8")
        tmp_file.replace(self.cache_file)

def batch_embed(
    texts: list[str],
    model: str = "text-embedding-3-small",
    batch_size: int = 100,
    requests_per_min: int = 500
) -> list[list[float]]:
    """Return one embedding per entry of *texts*, in the same order.

    Texts already present in the on-disk cache are served from it. The
    remaining texts are de-duplicated, sent to the embeddings API in batches
    of at most *batch_size*, and stored in the cache. The cache is saved even
    when a request fails, so embeddings fetched before the failure are kept.

    NOTE(review): cache entries are keyed by text only, not by *model*, so
    reusing the same cache file with a different model returns embeddings
    produced by the earlier model.

    Args:
        texts: Input strings to embed; duplicates are allowed.
        model: Embedding model name passed to the API.
        batch_size: Maximum number of texts per API request.
        requests_per_min: Request budget used to space out consecutive calls.

    Returns:
        A list parallel to *texts* holding each text's embedding.

    Raises:
        ValueError: If *batch_size* or *requests_per_min* is not positive.
        RuntimeError: If the API returns a different number of embeddings
            than texts sent in a batch.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if requests_per_min <= 0:
        raise ValueError("requests_per_min must be positive")

    cache = EmbeddingCache()
    results = [None] * len(texts)
    # Maps each uncached text to every position it occupies in `texts`, so a
    # repeated text is requested from the API only once.
    pending = {}

    # Check cache first
    for i, text in enumerate(texts):
        cached = cache.get(text)
        if cached is not None:
            results[i] = cached
        else:
            pending.setdefault(text, []).append(i)

    uncached_count = sum(len(positions) for positions in pending.values())
    print(f"Cache hits: {len(texts) - uncached_count}/{len(texts)}")

    # Batch API calls for uncached
    unique_texts = list(pending)
    delay = 60.0 / requests_per_min
    try:
        for batch_start in range(0, len(unique_texts), batch_size):
            if batch_start:
                # Pause only between requests; no wait is needed after the last.
                time.sleep(delay)
            batch = unique_texts[batch_start:batch_start + batch_size]
            response = client.embeddings.create(model=model, input=batch)
            if len(response.data) != len(batch):
                raise RuntimeError(
                    f"Expected {len(batch)} embeddings, got {len(response.data)}"
                )
            # Pair by the item's reported index rather than by list position.
            ordered = sorted(response.data, key=lambda item: item.index)
            for text, item in zip(batch, ordered):
                cache.set(text, item.embedding)
                for i in pending[text]:
                    results[i] = item.embedding
    finally:
        # Persist whatever was fetched, even if a later batch raised.
        cache.save()
    return results

# Usage
# NOTE: these statements run at module level, so batch_embed() is called (and
# may issue API requests) as soon as this file is executed or imported.
texts = ["Hello world", "Machine learning", "Hello world"]  # duplicate text: a cache hit only if an earlier run already stored it
embeddings = batch_embed(texts)
print(f"Generated {len(embeddings)} embeddings")

Use Cases

  • Large-scale indexing
  • Search engine building
  • Document processing

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.