python · advanced

Serverless GPU AI with Modal

Run GPU-accelerated ML inference serverlessly on Modal with automatic scaling and cold start optimization.

python
import modal

# Modal application that the decorated functions below are registered on.
app = modal.App('ml-inference')

# Container image: Debian slim base with the ML dependencies installed via pip.
image = (
    modal.Image
    .debian_slim()
    .pip_install('torch', 'transformers')
)

# Per-container cache for the sentiment pipeline. Module globals live as long
# as the container process, so calls served by a warm container reuse the
# already-loaded model instead of rebuilding it on every invocation.
_CLASSIFIER = None


def _get_classifier():
    """Return the sentiment-analysis pipeline, building it on first use."""
    global _CLASSIFIER
    if _CLASSIFIER is None:
        # Imported lazily: `transformers` is installed in the remote image and
        # is not required on the machine that launches the app.
        from transformers import pipeline

        _CLASSIFIER = pipeline(
            'sentiment-analysis',
            model='distilbert-base-uncased-finetuned-sst-2-english',
            device=0,  # first CUDA device of the container
        )
    return _CLASSIFIER


# NOTE(review): newer Modal releases appear to rename `container_idle_timeout`
# to `scaledown_window` -- confirm against the installed Modal version before
# switching the keyword.
@app.function(image=image, gpu='T4', timeout=120, container_idle_timeout=60)
def run_inference(texts: list[str]) -> list[str]:
    """Classify the sentiment of each text on a remote GPU container.

    Args:
        texts: Sentences to classify.

    Returns:
        One ``"<LABEL>: <score>"`` string per input, in input order, with the
        score formatted to three decimal places. An empty input yields an
        empty list without loading the model.
    """
    if not texts:
        # Nothing to classify; skip the (expensive) model load entirely.
        return []
    results = _get_classifier()(texts)
    return [f"{r['label']}: {r['score']:.3f}" for r in results]

@app.local_entrypoint()
def main():
    """Send a few sample sentences to `run_inference` and print each result."""
    samples = [
        'I love this product!',
        'This is terrible.',
        'Not bad, could be better.',
    ]

    # Make the remote call inside Modal's output context.
    with modal.enable_output():
        verdicts = run_inference.remote(samples)

    # Pair every input with its classification, one line per sample.
    for sample, verdict in zip(samples, verdicts):
        print(f'{sample!r} -> {verdict}')

Use Cases

  • serverless ML
  • GPU inference
  • scalable AI deployment

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.