python · advanced

vLLM High-Throughput LLM Serving

Serve open-source LLMs in production with high throughput using vLLM and its PagedAttention mechanism.

python — Press ⌘/Ctrl + Shift + C to copy

# Start server: python -m vllm.entrypoints.openai.api_server --model meta-llama/Llama-3.1-8B-Instruct --dtype float16

from openai import OpenAI
import asyncio

client = OpenAI(base_url='http://localhost:8000/v1', api_key='not-needed')

def list_models() -> list[str]:
    """Return the ``id`` of every entry in ``client.models.list().data``."""
    listing = client.models.list()
    return [entry.id for entry in listing.data]

def generate(prompt: str, model: str | None = None, max_tokens: int = 200) -> str:
    """Send ``prompt`` as a single user message and return the reply text.

    Args:
        prompt: Text placed in the one ``user`` message of the chat request.
        model: Model id to query. When omitted (or empty), the first id
            returned by ``list_models()`` is used.
        max_tokens: Passed through as the request's ``max_tokens``.

    Returns:
        The message content of the first choice, or ``''`` when the response
        carries no text content.

    Raises:
        IndexError: If no ``model`` was given and ``list_models()`` is empty.
    """
    if not model:
        # Query the models endpoint only when a default is actually needed;
        # callers that pass `model` skip this extra request entirely.
        available = list_models()
        if not available:
            raise IndexError(
                'no models are listed by the server; pass `model` explicitly'
            )
        model = available[0]

    resp = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
        max_tokens=max_tokens,
        temperature=0.7,
    )
    # The SDK types `content` as optional; normalise to '' so the declared
    # `str` return type holds and callers can slice the result safely.
    return resp.choices[0].message.content or ''

# Batch inference
# Demo loop: each prompt is sent with generate() one after another, and a
# preview of at most `preview_len` characters of each answer is printed.
prompts = ['Explain REST APIs.', 'What is a vector database?', 'Define gradient descent.']
preview_len = 100
for p in prompts:
    print(f'Q: {p}')
    # Guard against a reply with no text so the slice below cannot fail.
    answer = generate(p) or ''
    # Append the ellipsis only when the answer was actually cut short.
    suffix = '...' if len(answer) > preview_len else ''
    print(f'A: {answer[:preview_len]}{suffix}')