python · beginner

Stream LLM Chat Responses

Stream OpenAI chat completions token-by-token for real-time UI updates.

python
from openai import OpenAI

# Module-level client shared by stream_chat and stream_conversation below.
# Constructed with no explicit arguments — presumably credentials come from the
# environment (e.g. OPENAI_API_KEY); confirm against the openai package docs.
client = OpenAI()

def stream_chat(prompt: str, model: str = "gpt-4o") -> str:
    """Stream a chat response, echoing each text fragment as it arrives.

    Sends ``prompt`` as a single user message, prints every content delta to
    stdout without a trailing newline (flushed immediately so the text shows
    up as it is received), then prints one final newline.

    Args:
        prompt: Text of the user message.
        model: Model name passed to the chat completions API.

    Returns:
        The complete response text (all content deltas joined together).
    """
    stream = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )

    pieces = []
    for chunk in stream:
        # A streamed chunk may carry an empty ``choices`` list (e.g. a
        # usage-only final chunk or provider metadata chunks); indexing [0]
        # on it would raise IndexError and abort the stream mid-response.
        if not chunk.choices:
            continue
        text = chunk.choices[0].delta.content
        if text:
            print(text, end="", flush=True)
            pieces.append(text)

    print()  # newline after stream ends
    return "".join(pieces)

# Usage
# NOTE: these statements sit at module top level, so they execute (and call
# stream_chat, which issues a chat-completion request) whenever this file is
# run or imported.
response = stream_chat("Explain quantum computing in 3 sentences.")
# stream_chat has already printed the streamed text; report the size of what it returned.
print(f"\nTotal length: {len(response)} chars")

# With system message and conversation history
def stream_conversation(
    messages: list,
    model: str = "gpt-4o",
    *,
    temperature: float = 0.7,
    max_tokens: int = 500,
) -> str:
    """Stream a reply to an existing conversation and return it as one string.

    Unlike a print-as-you-go helper, this collects the streamed content
    silently and returns it once the stream is exhausted.

    Args:
        messages: Chat messages passed through unchanged to the API
            (e.g. dicts with ``"role"`` and ``"content"`` keys).
        model: Model name passed to the chat completions API.
        temperature: Sampling temperature passed to the API.
        max_tokens: Token limit passed to the API as ``max_tokens``.

    Returns:
        The complete response text (all content deltas joined together).
    """
    stream = client.chat.completions.create(
        model=model,
        messages=messages,
        stream=True,
        temperature=temperature,
        max_tokens=max_tokens,
    )

    pieces = []
    for chunk in stream:
        # A streamed chunk may carry an empty ``choices`` list (e.g. a
        # usage-only final chunk or provider metadata chunks); skip it
        # rather than fail with IndexError on ``choices[0]``.
        if not chunk.choices:
            continue
        text = chunk.choices[0].delta.content
        if text:
            pieces.append(text)
    return "".join(pieces)

Use Cases

  • Chat UIs
  • Real-time responses
  • Typewriter effects

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.