python · advanced

Multimodal RAG with Images and Text

Build the answering step of a multimodal RAG pipeline: pass retrieved images and their accompanying text context to a vision-capable model and get an answer to a question about them.

python
from openai import OpenAI
import base64
from pathlib import Path

# Module-level client shared by the functions below. It is constructed with no
# explicit arguments, so credentials are presumably picked up from the
# environment by the SDK — confirm against your deployment.
client = OpenAI()

def encode_image(path: str) -> str:
    """Read the file at ``path`` and return its bytes as base64 text."""
    raw_bytes = Path(path).read_bytes()
    encoded = base64.b64encode(raw_bytes)
    # b64encode yields bytes; callers embed the result in a string URL.
    return encoded.decode()

def multimodal_qa(
    question: str,
    image_paths: list[str],
    text_context: str,
    *,
    model: str = 'gpt-4o-mini',
    max_tokens: int = 500,
) -> str:
    """Ask a chat model a question about a set of images plus text context.

    Builds a single user message whose first part is the text prompt
    (context followed by the question) and whose remaining parts are the
    images, each embedded as a base64 data URL.

    Args:
        question: The question to ask about the images and context.
        image_paths: Paths of the image files to attach, in order.
        text_context: Text placed before the question in the prompt.
        model: Model name passed to the chat completions call.
        max_tokens: Upper bound on generated tokens passed to the call.

    Returns:
        The content of the first choice's message, or an empty string when
        the response carries no text content.
    """
    # Function-scope import so this block stays self-contained.
    import mimetypes

    content = [
        {'type': 'text', 'text': f'Context: {text_context}\n\nQuestion: {question}'},
    ]
    for img in image_paths:
        # Label the data URL with the type implied by the file name instead of
        # assuming PNG; fall back to PNG when the name gives no image type.
        mime_type, _ = mimetypes.guess_type(img)
        if mime_type is None or not mime_type.startswith('image/'):
            mime_type = 'image/png'
        content.append({
            'type': 'image_url',
            'image_url': {'url': f'data:{mime_type};base64,{encode_image(img)}'},
        })

    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': content}],
        max_tokens=max_tokens,
    )
    # The message content can be None; keep the declared ``str`` return type.
    return response.choices[0].message.content or ''

# Example: ask about a single chart image, with one sentence of context.
answer = multimodal_qa(
    'What does the chart show about Q4 performance?',  # question
    ['q4_chart.png'],  # image_paths
    'These charts represent quarterly financial results for 2024.',  # text_context
)
print(answer)

Use Cases

  • visual document Q&A
  • multimodal search
  • financial analysis

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.