typescript · advanced

Semantic Caching Layer for LLM Calls

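Cache LLM responses by the semantic similarity of their prompts: when a new prompt's embedding is close enough to a previously seen one, the stored response is returned instead of requesting a new completion. This reduces completion API costs and improves latency for repeated or near-duplicate queries; note that an embedding request is still made for every prompt.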

typescript
import OpenAI from 'openai';

// Shared SDK client used by this module for both embedding and chat-completion
// requests. It is constructed with no explicit options.
const openai = new OpenAI();

/**
 * One stored prompt/response pair together with the vector used to match
 * later prompts against it.
 */
interface CacheEntry {
  /** The prompt text that was sent to the model. */
  prompt: string;
  /** The model's reply that is returned on a cache hit. */
  response: string;
  /** Embedding vector computed for `prompt`; compared via cosine similarity. */
  embedding: number[];
}

// In-memory, module-level store of previously answered prompts. In the code
// shown here entries are only ever appended; nothing evicts or expires them.
const cache: CacheEntry[] = [];
// Minimum cosine similarity (inclusive) between a new prompt's embedding and a
// stored one for the stored response to be reused.
const SIMILARITY_THRESHOLD = 0.95;

/**
 * Computes the cosine similarity of two numeric vectors.
 *
 * @param a - First vector.
 * @param b - Second vector; must have the same number of components as `a`.
 * @returns `dot(a, b) / (|a| * |b|)`. Returns 0 when either vector has zero
 *   magnitude (including two empty vectors) instead of the NaN that `0 / 0`
 *   would otherwise produce.
 * @throws RangeError if the vectors differ in length, since comparing only a
 *   prefix (or reading past the end of `b`) would give a meaningless score.
 */
function cosine(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosine: vector length mismatch (${a.length} vs ${b.length})`,
    );
  }

  let dot = 0;
  let normA = 0;
  let normB = 0;
  for (let i = 0; i < a.length; i++) {
    dot += a[i] * b[i];
    normA += a[i] ** 2;
    normB += b[i] ** 2;
  }

  const denominator = Math.sqrt(normA) * Math.sqrt(normB);
  // A zero-magnitude vector has no direction; treat it as "not similar"
  // so threshold comparisons by callers stay well-defined.
  return denominator === 0 ? 0 : dot / denominator;
}

/**
 * Requests an embedding for `text` from the `text-embedding-3-small` model
 * through the module's OpenAI client.
 *
 * @param text - The string to embed.
 * @returns The embedding of the first item in the response's `data` array.
 */
async function getEmbedding(text: string): Promise<number[]> {
  const request = { input: text, model: 'text-embedding-3-small' };
  const { data } = await openai.embeddings.create(request);
  return data[0].embedding;
}

/**
 * Answers `prompt` with a chat completion, reusing a previously stored
 * response when a semantically similar prompt has already been answered.
 *
 * The prompt is embedded on every call. The cache is then scanned in full and
 * the entry with the highest cosine similarity is used, provided that score is
 * at least `SIMILARITY_THRESHOLD`. On a miss, `gpt-4o` is queried with the
 * prompt as a single user message and the reply is stored for future calls.
 *
 * @param prompt - The user prompt to answer.
 * @returns The cached or freshly generated response text; an empty string if
 *   the model returned no text content.
 */
export async function cachedCompletion(prompt: string): Promise<string> {
  const queryEmb = await getEmbedding(prompt);

  // Pick the closest qualifying entry rather than the first one encountered,
  // so the result does not depend on insertion order when several entries
  // clear the threshold. Ties keep the earliest entry.
  let bestEntry: CacheEntry | undefined;
  let bestScore = -Infinity;
  for (const entry of cache) {
    const score = cosine(queryEmb, entry.embedding);
    if (score >= SIMILARITY_THRESHOLD && score > bestScore) {
      bestEntry = entry;
      bestScore = score;
    }
  }

  if (bestEntry !== undefined) {
    console.log('Cache hit');
    return bestEntry.response;
  }

  const res = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [{ role: 'user', content: prompt }],
  });
  const response = res.choices[0].message.content ?? '';

  // Do not memoize an empty reply (e.g. null content): caching it would serve
  // an empty answer to every similar prompt for the lifetime of the process.
  if (response !== '') {
    cache.push({ embedding: queryEmb, prompt, response });
  }
  return response;
}

Use Cases

  • Reducing LLM API costs for repeated queries
  • Low-latency AI responses for common questions
  • Building cost-effective AI-powered features

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.