python · intermediate
Analyze Images with GPT Vision API
Send images to GPT-4o for description, analysis, and visual Q&A.
python — Press ⌘/Ctrl + Shift + C to copy
from openai import OpenAI
import base64
from pathlib import Path
# Module-level OpenAI client, constructed with no explicit arguments; the
# functions in this snippet send their chat-completion requests through it.
# NOTE(review): presumably picks up credentials from the environment
# (e.g. OPENAI_API_KEY) — confirm against the SDK documentation.
client = OpenAI()
def encode_image(image_path: str) -> str:
    """Return the contents of the file at *image_path* as base64 text.

    The file is read in full as raw bytes, base64-encoded, and the encoded
    bytes are decoded to ``str``.

    Args:
        image_path: Path to the file to encode.

    Returns:
        The base64 representation of the file's bytes.
    """
    raw_bytes = Path(image_path).read_bytes()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode()
def analyze_image(image_path: str, question: str) -> str:
    """Ask a question about a local image.

    The image is base64-encoded with ``encode_image`` and sent inline as a
    data URL, together with *question*, in a single user message to the
    ``gpt-4o`` model via ``client.chat.completions.create``.

    Args:
        image_path: Path to the image file on disk.
        question: Text prompt sent alongside the image.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # Imported locally so this function does not rely on the file's import block.
    import mimetypes

    # Label the data URL with the image type implied by the file extension
    # instead of always claiming PNG. Unknown or non-image extensions keep the
    # previous "image/png" label, so existing PNG callers see no change.
    guessed_type = mimetypes.guess_type(image_path)[0] or ""
    mime_type = guessed_type if guessed_type.startswith("image/") else "image/png"

    base64_image = encode_image(image_path)
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": question},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                            "detail": "high",
                        },
                    },
                ],
            }
        ],
        max_tokens=1000,
    )
    return response.choices[0].message.content
# Example: ask the model to review the UI shown in a screenshot file,
# then print the answer it returns.
result = analyze_image(
    image_path="screenshot.png",
    question="Describe the UI layout and suggest improvements.",
)
print(result)
# Compare two images
def compare_images(path1: str, path2: str) -> str:
    """Ask the model to compare two local images.

    Both files are base64-encoded with ``encode_image`` and sent inline as
    data URLs in a single user message, after the fixed prompt
    "Compare these two images.", to the ``gpt-4o`` model.

    Args:
        path1: Path to the first image file.
        path2: Path to the second image file.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    # Imported locally so this function does not rely on the file's import block.
    import mimetypes

    def _as_data_url(path: str) -> str:
        """Build a base64 data URL for *path*, labelled with its image type."""
        # Use the type implied by the file extension; unknown or non-image
        # extensions keep the previous "image/png" label.
        guessed_type = mimetypes.guess_type(path)[0] or ""
        mime_type = guessed_type if guessed_type.startswith("image/") else "image/png"
        return f"data:{mime_type};base64,{encode_image(path)}"

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "Compare these two images."},
                {"type": "image_url", "image_url": {"url": _as_data_url(path1)}},
                {"type": "image_url", "image_url": {"url": _as_data_url(path2)}},
            ],
        }],
    )
    return response.choices[0].message.content


# Use Cases
- Image analysis
- UI review
- Visual Q&A
- Accessibility audits
Tags
Related Snippets
Similar patterns you can reuse in the same workflow.
typescript · intermediate
OpenAI Vision API Image Analysis
Analyze images using GPT-4o vision capabilities with base64 and URL inputs.
Best for: image captioning
#ai #vision
python · beginner
OpenAI GPT-4 Vision Image Analysis
Analyse images from URLs or base64 with GPT-4 Vision for structured visual understanding.
Best for: visual Q&A
#openai #vision
python · beginner
Google Gemini Vision API in Python
Analyse images and PDFs using Google Gemini's multimodal vision API with the Python SDK.
Best for: image analysis
#gemini #vision
python · advanced
Multimodal RAG with Images and Text
Build a multimodal RAG pipeline that retrieves and answers questions about image+text documents.
Best for: visual document Q&A
#multimodal #rag