python · advanced
Prepare a Fine-Tuning Dataset for OpenAI
Format, validate, and upload training data for OpenAI model fine-tuning.
python — Press ⌘/Ctrl + Shift + C to copy
import json
from pathlib import Path
from openai import OpenAI
# Module-level API client; used further down for the file upload and for
# creating the fine-tuning job.
client = OpenAI()
def create_training_example(
    system: str, user: str, assistant: str
) -> dict:
    """Build one chat-format training record.

    Args:
        system: Content of the system message.
        user: Content of the user message.
        assistant: Content of the assistant message (the target reply).

    Returns:
        A dict with a single ``"messages"`` key holding three role/content
        dicts in the order system, user, assistant.
    """
    return {
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
            {"role": "assistant", "content": assistant},
        ]
    }
# Build dataset
# NOTE(review): two records are enough to show the format, but OpenAI's
# fine-tuning guide asks for a minimum number of examples (10 at the time of
# writing) -- confirm and extend this list before starting a real job.
examples = [
    create_training_example(
        system="You are a code reviewer.",
        user="Review: `x = eval(input())`",
        assistant="CRITICAL: Never use eval() on user input. "
                  "Use ast.literal_eval() or explicit parsing.",
    ),
    create_training_example(
        system="You are a code reviewer.",
        user="Review: `password = 'admin123'`",
        assistant="CRITICAL: Hardcoded credentials. "
                  "Use environment variables or a secrets manager.",
    ),
]

# Write JSONL file: one JSON object per line, newline-terminated.
output = Path("training_data.jsonl")
# Pin the encoding so the file does not depend on the platform default.
with output.open("w", encoding="utf-8") as f:
    for example in examples:
        f.write(json.dumps(example) + "\n")
# Validate format
def validate_dataset(path: str) -> bool:
    """Check that every line of a JSONL file is a usable chat training record.

    Each line must be a JSON object with a non-empty ``"messages"`` list in
    which every entry is an object carrying a ``"role"`` key, and whose last
    entry has the role ``"assistant"``. Every problem found is printed to
    stdout, prefixed with its 1-based line number.

    Args:
        path: Location of the JSONL file to check.

    Returns:
        True when no problems were found, False otherwise.
    """
    errors = []
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            # Keep the try body to the one call that can raise JSONDecodeError.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                errors.append(f"Line {i}: invalid JSON")
                continue

            # Skip further checks for this line: indexing data["messages"]
            # below would otherwise raise instead of reporting.
            if not isinstance(data, dict) or "messages" not in data:
                errors.append(f"Line {i}: missing 'messages'")
                continue

            messages = data["messages"]
            if not isinstance(messages, list) or not messages:
                errors.append(f"Line {i}: 'messages' must be a non-empty list")
                continue

            if any(not isinstance(m, dict) or "role" not in m for m in messages):
                errors.append(f"Line {i}: every message needs a 'role'")
                continue

            if messages[-1]["role"] != "assistant":
                errors.append(f"Line {i}: must end with 'assistant'")

    for e in errors:
        print(e)
    return not errors
# Validate the file that was just written (via `output`, so the path is not
# duplicated as a second literal) and only upload when it is clean.
# validate_dataset prints any problems itself, so there is no else branch.
if validate_dataset(str(output)):
    # Upload and start fine-tuning
    # Open the file in a with-block so the handle is closed once the upload
    # call returns instead of being leaked.
    with output.open("rb") as training_fh:
        file = client.files.create(
            file=training_fh,
            purpose="fine-tune",
        )
    job = client.fine_tuning.jobs.create(
        training_file=file.id,
        model="gpt-4o-mini-2024-07-18",
    )
    print(f"Fine-tuning job: {job.id}")

# Use Cases
- Model customization
- Domain-specific AI
- Improving accuracy
Tags
Related Snippets
Similar patterns you can reuse in the same workflow.
python · advanced
Prepare Fine-Tuning Dataset for OpenAI
Build, validate, and upload a JSONL dataset for fine-tuning OpenAI GPT models.
Best for: model customization
#openai #fine-tuning
typescript · beginner
DALL·E 3 Image Generation
Generate images from a text prompt using the OpenAI DALL·E 3 API and return a URL.
Best for: AI art generation
#openai #dall-e
typescript · beginner
Content Moderation with OpenAI
Check user input for harmful content using the OpenAI Moderation API before processing.
Best for: user input safety
#openai #moderation
typescript · intermediate
Next.js AI Streaming Route Handler
Stream OpenAI responses from a Next.js App Router route handler using the Vercel AI SDK.
Best for: AI chatbot backend
#nextjs #openai