Python · Intermediate

Prompt Caching with OpenAI API

Reduce costs by up to 50% using OpenAI's automatic prompt caching for repeated context prefixes.

python
from openai import OpenAI

# Single module-level client, created once and reused by every request below.
client = OpenAI()

# One instruction sentence repeated 500 times: a deliberately long system
# prompt that stays identical from request to request.
_INSTRUCTION_SENTENCE = 'You are an expert Python developer. '
LARGE_SYSTEM_PROMPT = _INSTRUCTION_SENTENCE * 500

def query_with_cache(question: str) -> dict:
    """Ask ``question`` behind the shared system prompt and report token usage.

    Every call sends the same ``LARGE_SYSTEM_PROMPT`` as the first message, so
    the start of the prompt is identical between requests; only the user
    message varies.

    Args:
        question: The user message placed after the system prompt.

    Returns:
        A dict with three keys:

        * ``'answer'``: content of the first choice's message.
        * ``'cached_tokens'``: cached prompt-token count reported in the
          response's usage details, or ``0`` when no figure is reported.
        * ``'total_tokens'``: total token count reported in the response's
          usage, or ``0`` when the response carries no usage data.
    """
    resp = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[
            {'role': 'system', 'content': LARGE_SYSTEM_PROMPT},
            {'role': 'user', 'content': question},
        ],
    )

    # Walk the usage payload defensively: a plain getattr default only covers
    # a *missing* attribute, not one that is present but None, and it does not
    # help at all when `usage` itself is None. `or 0` folds every one of those
    # cases into an integer zero so callers can always do arithmetic on it.
    # NOTE(review): the SDK appears to type these fields as optional; confirm
    # against the installed openai version.
    usage = resp.usage
    details = getattr(usage, 'prompt_tokens_details', None)
    cached_tokens = getattr(details, 'cached_tokens', None) or 0
    total_tokens = getattr(usage, 'total_tokens', None) or 0

    return {
        'answer': resp.choices[0].message.content,
        'cached_tokens': cached_tokens,
        'total_tokens': total_tokens,
    }

# Demo: ask two different questions that share the same system prompt and
# print the cached-token and total-token counts reported for each request.
# The calls run sequentially, so the first request has completed before the
# second one is sent.

# First call — the shared prompt is sent for the first time in this run, so
# no cache hit is expected.
r1 = query_with_cache('What is a list comprehension?')
print(f'Q1 | cached={r1["cached_tokens"]} | total={r1["total_tokens"]}')

# Second call — same system prompt, different question; the cached count is
# expected to be non-zero here.
# NOTE(review): nothing in this script guarantees a hit — confirm the
# caching conditions against the provider's documentation.
r2 = query_with_cache('How do I use decorators?')
print(f'Q2 | cached={r2["cached_tokens"]} | total={r2["total_tokens"]}')

Use Cases

  • cost reduction
  • repeated context
  • prompt optimization

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.