Python · Intermediate

LLM Retry with Model Fallback

Add resilient retry logic with exponential backoff and automatic model fallback for LLM calls.

python
import time
from openai import OpenAI, RateLimitError, APITimeoutError

# Module-level client shared by every call to chat_with_fallback.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment — confirm against the deployment configuration.
client = OpenAI()

# Models tried by chat_with_fallback, in list order: the first entry is
# attempted first and later entries are used only after it runs out of retries.
FALLBACK_MODELS = ['gpt-4o-mini', 'gpt-3.5-turbo']

def chat_with_fallback(prompt: str, max_retries: int = 3) -> str:
    """Send *prompt* as a single user message, retrying and falling back across models.

    Each model in ``FALLBACK_MODELS`` is tried in order, up to ``max_retries``
    times. A rate-limit error waits ``2 ** attempt`` seconds before the next
    attempt on the same model; a timeout is retried immediately. Once a model
    has used all of its attempts, the next model in the list is tried.

    Args:
        prompt: Text sent as the content of the user message.
        max_retries: Maximum number of attempts per model. A value below 1
            means no request is made and ``RuntimeError`` is raised directly.

    Returns:
        The message content of the first choice of the first successful
        response.

    Raises:
        RuntimeError: If every model failed on every attempt. The last
            rate-limit or timeout error is attached as ``__cause__``.

    Any exception other than ``RateLimitError`` and ``APITimeoutError`` is not
    handled here and propagates to the caller.
    """
    last_error = None
    for model in FALLBACK_MODELS:
        for attempt in range(max_retries):
            try:
                resp = client.chat.completions.create(
                    model=model,
                    messages=[{'role': 'user', 'content': prompt}],
                    timeout=30,
                )
            except RateLimitError as err:
                last_error = err
                if attempt + 1 >= max_retries:
                    # No retry follows on this model, so sleeping here would
                    # only delay the fallback (or the final error).
                    print(f'Rate limited on {model}, retries exhausted')
                    break
                wait = 2 ** attempt
                print(f'Rate limited on {model}, waiting {wait}s')
                time.sleep(wait)
            except APITimeoutError as err:
                last_error = err
                print(f'Timeout on {model}, attempt {attempt+1}')
            else:
                # Success path kept outside the try body so that only the
                # API call itself is guarded by the retry handlers.
                return resp.choices[0].message.content
    # Chain the last API failure so callers can see what actually went wrong.
    raise RuntimeError('All models and retries exhausted') from last_error

# Example usage: sends one prompt through chat_with_fallback and prints the
# reply. This is a top-level statement, so it runs whenever the file is executed
# or imported.
print(chat_with_fallback('What is 2+2?'))

Use Cases

  • production resilience
  • rate limit handling
  • model fallback

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.