python · advanced

Prepare a Fine-Tuning Dataset for OpenAI

Format, validate, and upload training data for OpenAI model fine-tuning.

python
import json
from pathlib import Path
from openai import OpenAI

# Shared API client; used below to upload the training file and to create
# the fine-tuning job.
client = OpenAI()

def create_training_example(
    system: str, user: str, assistant: str
) -> dict:
    """Build one chat-format training record.

    Args:
        system: Content of the system message.
        user: Content of the user message.
        assistant: Content of the assistant message.

    Returns:
        A dict with a single "messages" key holding the three messages in
        system, user, assistant order, each as a role/content dict.
    """
    # Pair each role with its text so the message dicts are built uniformly.
    turns = (("system", system), ("user", user), ("assistant", assistant))
    return {
        "messages": [{"role": role, "content": text} for role, text in turns]
    }

# Build dataset: (user prompt, expected assistant reply) pairs; every example
# shares the same system prompt.
review_pairs = [
    (
        "Review: `x = eval(input())`",
        "CRITICAL: Never use eval() on user input. "
        "Use ast.literal_eval() or explicit parsing.",
    ),
    (
        "Review: `password = 'admin123'`",
        "CRITICAL: Hardcoded credentials. "
        "Use environment variables or a secrets manager.",
    ),
]
examples = [
    create_training_example(
        system="You are a code reviewer.", user=prompt, assistant=reply
    )
    for prompt, reply in review_pairs
]

# Write JSONL file: one JSON object per line.
output = Path("training_data.jsonl")
with output.open("w") as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + "\n" for record in examples)

# Validate format
def validate_dataset(path: str) -> bool:
    """Check that every line of a JSONL file is a usable chat training example.

    Each line must be a JSON object with a non-empty "messages" list in which
    every message carries a "role" and the last role is "assistant". Problems
    are collected per line and printed; malformed records are reported rather
    than raising.

    Args:
        path: Path of the JSONL file to check.

    Returns:
        True when no line produced an error, False otherwise.
    """
    errors = []
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            # Keep the try body to the one call that raises JSONDecodeError.
            try:
                data = json.loads(line)
            except json.JSONDecodeError:
                errors.append(f"Line {i}: invalid JSON")
                continue

            # Non-object JSON (list, string, number) cannot hold "messages".
            if not isinstance(data, dict) or "messages" not in data:
                errors.append(f"Line {i}: missing 'messages'")
                continue

            messages = data["messages"]
            # An empty list would make the last-role check below index out
            # of range, so report it instead.
            if not isinstance(messages, list) or not messages:
                errors.append(f"Line {i}: 'messages' must be a non-empty list")
                continue

            if not all(isinstance(m, dict) and "role" in m for m in messages):
                errors.append(f"Line {i}: every message needs a 'role'")
                continue

            if messages[-1]["role"] != "assistant":
                errors.append(f"Line {i}: must end with 'assistant'")
    for e in errors:
        print(e)
    return not errors

# Only upload and start a job when the dataset passed validation.
if validate_dataset("training_data.jsonl"):
    # Upload and start fine-tuning
    # Open the file in a context manager so the handle is closed once the
    # upload call returns instead of being left open.
    with output.open("rb") as training_file:
        file = client.files.create(
            file=training_file,
            purpose="fine-tune"
        )
    # The uploaded file's id is what the fine-tuning job trains on.
    job = client.fine_tuning.jobs.create(
        training_file=file.id,
        model="gpt-4o-mini-2024-07-18"
    )
    print(f"Fine-tuning job: {job.id}")

Use Cases

  • Model customization
  • Domain-specific AI
  • Improving accuracy

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.