TypeScript · Intermediate

Batch Embeddings Processing

Generate embeddings for large document sets in batches, with a fixed delay between requests as simple rate limiting and progress logged to the console.

typescript
import OpenAI from 'openai';

// Module-level client used by batchEmbed for its embedding requests.
// Constructed with no explicit options; presumably the SDK picks up the API
// key from the environment (e.g. OPENAI_API_KEY) — confirm against the SDK docs.
const openai = new OpenAI();

/** A source text paired with the embedding vector returned for it. */
interface EmbeddingResult {
  /** The input string that was sent to the embeddings endpoint. */
  text: string;
  /** The embedding values returned by the API for `text`. */
  embedding: number[];
}

/**
 * Pauses the calling async flow for roughly `ms` milliseconds.
 *
 * @param ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves with no value once the timer fires.
 */
async function sleep(ms: number): Promise<void> {
  await new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}

/**
 * Embeds `texts` with the `text-embedding-3-small` model, sending one API
 * request per batch and pausing between consecutive requests.
 *
 * Batches are processed sequentially. Progress is logged to the console before
 * each request, and a summary line is logged once all batches are done.
 *
 * @param texts - Strings to embed. An empty array yields an empty result
 *   without issuing any request.
 * @param batchSize - Maximum number of texts sent per request. Must be greater
 *   than 0.
 * @param delayMs - Pause between consecutive requests, in milliseconds. No
 *   pause is inserted after the final batch.
 * @returns One `{ text, embedding }` entry per input text, in input order.
 * @throws RangeError (as a rejected promise) when `batchSize` is not greater
 *   than 0.
 * @throws Error (as a rejected promise) when a response contains a different
 *   number of embeddings than the number of texts sent in that batch.
 */
export async function batchEmbed(
  texts: string[],
  batchSize = 100,
  delayMs = 200,
): Promise<EmbeddingResult[]> {
  // Written as a negated comparison so NaN is rejected as well. A step of 0
  // would never advance the loop below, and a negative or NaN step cannot
  // walk through `texts` correctly.
  if (!(batchSize > 0)) {
    throw new RangeError(`batchSize must be greater than 0, received ${batchSize}`);
  }

  const results: EmbeddingResult[] = [];
  // Loop-invariant, so compute it once for the progress messages.
  const totalBatches = Math.ceil(texts.length / batchSize);

  for (let i = 0; i < texts.length; i += batchSize) {
    const batch = texts.slice(i, i + batchSize);
    console.log(`Processing batch ${Math.floor(i / batchSize) + 1}/${totalBatches}`);

    const res = await openai.embeddings.create({
      model: 'text-embedding-3-small',
      input: batch,
    });

    // Embeddings are paired with texts by position, so a count mismatch would
    // either drop texts silently or attach an embedding to an undefined text.
    if (res.data.length !== batch.length) {
      throw new Error(
        `Embedding count mismatch: sent ${batch.length} texts, received ${res.data.length} embeddings`,
      );
    }

    for (let j = 0; j < batch.length; j++) {
      results.push({
        text: batch[j],
        embedding: res.data[j].embedding,
      });
    }

    // Only wait when another batch follows; the last one needs no cooldown.
    if (i + batchSize < texts.length) {
      await sleep(delayMs);
    }
  }

  console.log(`Generated ${results.length} embeddings`);
  return results;
}

// Example run: embed three short documents, then print how many values the
// first returned embedding contains.
const docs = ['Document one', 'Document two', 'Document three'];
const [firstResult] = await batchEmbed(docs);
console.log(firstResult.embedding.length);

Use Cases

  • Indexing large document collections for search
  • Building vector databases from text corpora
  • Preprocessing data for recommendation systems

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.