typescript · advanced
LLM Output Evaluation and Scoring
Evaluate LLM outputs programmatically with scoring rubrics for quality, relevance, and safety.
typescript — Press ⌘/Ctrl + Shift + C to copy
import OpenAI from 'openai';
// Shared OpenAI client used by every request in this file. It is constructed
// with no explicit options, so credentials and other settings presumably come
// from the SDK's defaults (e.g. environment variables) — confirm for your setup.
const openai = new OpenAI();
/**
 * Outcome of grading a single LLM output against a rubric.
 */
interface EvalResult {
  /** Per-criterion scores keyed by criterion name (e.g. "relevance"). */
  criteria: Record<string, number>;
  /** Free-text justification supplied by the evaluator. */
  reasoning: string;
  /** Overall score assigned by the evaluator. */
  score: number;
  /** Whether the overall score met the pass threshold. */
  passed: boolean;
}
/**
 * Asks an LLM judge to grade `output` (the response to `prompt`) against
 * `rubric` and returns the structured verdict.
 *
 * The judge is called with JSON mode and temperature 0. Its reply is treated
 * as untrusted input: it is parsed as `unknown` and validated before being
 * turned into an `EvalResult`.
 *
 * @param prompt - The original prompt that produced `output`.
 * @param output - The LLM output being graded.
 * @param rubric - Free-text scoring rubric handed to the judge.
 * @param passThreshold - Minimum overall score counted as a pass (default 7).
 * @returns The overall score, pass flag, reasoning, and per-criterion scores.
 * @throws Error if the judge returns no content, invalid JSON, a non-object
 *   payload, or a payload without a finite numeric `score`.
 */
async function evaluateOutput(
  prompt: string,
  output: string,
  rubric: string,
  passThreshold = 7
): Promise<EvalResult> {
  const evalPrompt = [
    'You are an AI output evaluator. Score the following output.',
    `Original Prompt: ${prompt}`,
    `Output to Evaluate: ${output}`,
    `Scoring Rubric: ${rubric}`,
    'Respond in JSON: {"score": <0-10>, "criteria": {"relevance": <0-10>, "accuracy": <0-10>, "completeness": <0-10>, "clarity": <0-10>}, "reasoning": "..."}',
  ].join('\n');

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [{ role: 'user', content: evalPrompt }],
    response_format: { type: 'json_object' },
    temperature: 0,
  });

  // The message content may be absent (e.g. no choices, or a null content
  // field), so guard instead of asserting non-null.
  const raw = response.choices[0]?.message?.content;
  if (typeof raw !== 'string' || raw.trim() === '') {
    throw new Error('Evaluator returned no content to parse');
  }

  let parsed: unknown;
  try {
    parsed = JSON.parse(raw);
  } catch (err: unknown) {
    const detail = err instanceof Error ? err.message : String(err);
    throw new Error(`Evaluator returned invalid JSON: ${detail}`);
  }
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new Error('Evaluator response is not a JSON object');
  }
  const fields = parsed as Record<string, unknown>;

  // The overall score drives the pass/fail decision, so it is mandatory.
  const score = fields.score;
  if (typeof score !== 'number' || !Number.isFinite(score)) {
    throw new Error('Evaluator response is missing a numeric "score"');
  }

  // Secondary fields are kept on a best-effort basis: non-numeric criteria are
  // dropped and a missing reasoning becomes an empty string.
  const criteria: Record<string, number> = {};
  if (typeof fields.criteria === 'object' && fields.criteria !== null) {
    for (const [name, value] of Object.entries(fields.criteria)) {
      if (typeof value === 'number' && Number.isFinite(value)) {
        criteria[name] = value;
      }
    }
  }
  const reasoning = typeof fields.reasoning === 'string' ? fields.reasoning : '';

  return { score, passed: score >= passThreshold, reasoning, criteria };
}
/**
 * Runs every test prompt against `model`, grades each completion with
 * `evaluateOutput`, logs the average score and pass rate, and returns the
 * per-case results.
 *
 * All test cases are dispatched concurrently via `Promise.all`, so a single
 * failed request or evaluation rejects the whole batch.
 *
 * @param testCases - Prompts to run, each with the topics its answer must cover.
 * @param model - Name of the model under test.
 * @returns One entry per test case with the prompt, the model output, and its
 *   evaluation, in the same order as `testCases`.
 */
async function batchEvaluate(
  testCases: { prompt: string; expectedTopics: string[] }[],
  model: string
): Promise<{ prompt: string; output: string; evaluation: EvalResult }[]> {
  const results = await Promise.all(
    testCases.map(async (tc) => {
      const completion = await openai.chat.completions.create({
        model,
        messages: [{ role: 'user', content: tc.prompt }],
      });
      // Missing content is graded as an empty answer rather than crashing.
      const output = completion.choices[0]?.message?.content ?? '';
      const evaluation = await evaluateOutput(
        tc.prompt,
        output,
        `Must cover topics: ${tc.expectedTopics.join(', ')}`
      );
      return { prompt: tc.prompt, output, evaluation };
    })
  );

  // With no test cases the averages below would be 0 / 0 (NaN), so skip them.
  if (results.length === 0) {
    console.log('No test cases to evaluate.');
    return results;
  }

  const totalScore = results.reduce((sum, r) => sum + r.evaluation.score, 0);
  const passedCount = results.filter((r) => r.evaluation.passed).length;
  const avgScore = totalScore / results.length;
  const passRate = passedCount / results.length;

  console.log(`Average Score: ${avgScore.toFixed(1)}/10`);
  console.log(`Pass Rate: ${(passRate * 100).toFixed(0)}%`);
  return results;
}
Use Cases
- prompt testing
- model comparison
- quality assurance
Tags
Related Snippets
Similar patterns you can reuse in the same workflow.
python · intermediate
LLM Prompt Testing Framework
Write automated tests for LLM prompts using Python assertions to detect regressions.
Best for: prompt regression testing
#testing #prompts
python · intermediate
LLM Testing with DeepEval
Write unit tests for LLM outputs using the DeepEval framework for correctness and hallucination detection.
Best for: LLM testing
#deepeval #testing
typescript · beginner
Claude Messages API (Anthropic SDK)
Send messages to Claude using the official Anthropic SDK with system prompt and user turn.
Best for: AI assistant
#anthropic #claude
typescript · beginner
DALL·E 3 Image Generation
Generate images from a text prompt using the OpenAI DALL·E 3 API and return a URL.
Best for: AI art generation
#openai #dall-e