TypeScript · Advanced

LLM Output Evaluation and Scoring

Evaluate LLM outputs programmatically with scoring rubrics for quality, relevance, and safety.

typescript
import OpenAI from 'openai';

// Shared client used for every chat-completion request in this file.
// Constructed with no options.
// NOTE(review): presumably the SDK reads credentials from the environment
// (e.g. OPENAI_API_KEY) — confirm against the deployment setup.
const openai = new OpenAI();

/** Outcome of grading a single model output with `evaluateOutput`. */
interface EvalResult {
  /** Overall score reported by the evaluator; the evaluator prompt asks for 0-10. */
  score: number;
  /**
   * Per-criterion scores keyed by criterion name. The evaluator prompt asks
   * for `relevance`, `accuracy`, `completeness` and `clarity`.
   */
  criteria: Record<string, number>;
  /** The evaluator's free-text justification for the scores. */
  reasoning: string;
  /** Whether `score` reached the passing threshold (7 in `evaluateOutput`). */
  passed: boolean;
}

/**
 * Grades a model output against a rubric by asking an LLM judge (`gpt-4o`,
 * temperature 0, JSON response mode) to score it.
 *
 * The judge's reply is validated before use: an absent or non-numeric `score`
 * is reported as an error instead of silently yielding `passed: false` and
 * poisoning downstream averages with `NaN`.
 *
 * @param prompt - The prompt that produced `output`.
 * @param output - The text being graded.
 * @param rubric - Free-form grading instructions embedded in the judge prompt.
 * @param passThreshold - Minimum overall score for `passed` to be true. Defaults to 7.
 * @returns The judge's overall score, numeric per-criterion scores and
 *   reasoning, plus the derived `passed` flag.
 * @throws SyntaxError if the judge's reply is not valid JSON.
 * @throws TypeError if the reply is empty, is not a JSON object, or has no
 *   finite numeric `score`.
 */
async function evaluateOutput(
  prompt: string,
  output: string,
  rubric: string,
  passThreshold = 7
): Promise<EvalResult> {
  const evalPrompt = `You are an AI output evaluator. Score the following output.

Original Prompt: ${prompt}
Output to Evaluate: ${output}
Scoring Rubric: ${rubric}

Respond in JSON: {"score": <0-10>, "criteria": {"relevance": <0-10>, "accuracy": <0-10>, "completeness": <0-10>, "clarity": <0-10>}, "reasoning": "..."}`;

  const response = await openai.chat.completions.create({
    model: 'gpt-4o',
    messages: [{ role: 'user', content: evalPrompt }],
    response_format: { type: 'json_object' },
    temperature: 0,
  });

  return parseEvalResult(response.choices[0]?.message.content, passThreshold);
}

/** Validates the judge's raw reply text and converts it into an `EvalResult`. */
function parseEvalResult(
  content: string | null | undefined,
  passThreshold: number
): EvalResult {
  if (content == null) {
    throw new TypeError('Evaluator returned no message content');
  }

  // JSON.parse is left unwrapped so malformed JSON still surfaces as SyntaxError.
  const parsed: unknown = JSON.parse(content);
  if (typeof parsed !== 'object' || parsed === null || Array.isArray(parsed)) {
    throw new TypeError(`Evaluator response is not a JSON object: ${content}`);
  }

  // Safe after the runtime object check above.
  const { score, criteria, reasoning } = parsed as Record<string, unknown>;
  if (typeof score !== 'number' || !Number.isFinite(score)) {
    throw new TypeError(`Evaluator response has no numeric "score": ${content}`);
  }

  // Criteria and reasoning are auxiliary: keep what is well-formed rather than
  // failing the whole evaluation over them.
  const numericCriteria: Record<string, number> = {};
  if (typeof criteria === 'object' && criteria !== null) {
    for (const [name, value] of Object.entries(criteria)) {
      if (typeof value === 'number') {
        numericCriteria[name] = value;
      }
    }
  }

  return {
    score,
    passed: score >= passThreshold,
    reasoning: typeof reasoning === 'string' ? reasoning : '',
    criteria: numericCriteria,
  };
}

/**
 * Runs every test prompt through `model`, grades each completion with
 * `evaluateOutput`, and logs the average score and pass rate.
 *
 * @param testCases - Prompts to run, each with the topics its answer must
 *   cover; the topics become the grading rubric.
 * @param model - Model name passed to the chat-completions request under test.
 * @returns One entry per test case with the prompt, the generated output and
 *   its evaluation, in input order. Empty when `testCases` is empty.
 */
async function batchEvaluate(
  testCases: { prompt: string; expectedTopics: string[] }[],
  model: string
) {
  // All cases are started together via Promise.all, so one rejected request
  // rejects the whole batch.
  const results = await Promise.all(
    testCases.map(async (tc) => {
      const completion = await openai.chat.completions.create({
        model,
        messages: [{ role: 'user', content: tc.prompt }],
      });
      // The message content may be absent; grade that as an empty answer
      // instead of interpolating the literal string "null" into the judge prompt.
      const output = completion.choices[0]?.message.content ?? '';
      const evaluation = await evaluateOutput(
        tc.prompt, output,
        `Must cover topics: ${tc.expectedTopics.join(', ')}`
      );
      return { prompt: tc.prompt, output, evaluation };
    })
  );

  // With nothing evaluated the averages below would divide by zero and print NaN.
  if (results.length === 0) {
    console.log('No test cases to evaluate.');
    return results;
  }

  const totalScore = results.reduce((sum, r) => sum + r.evaluation.score, 0);
  const passedCount = results.filter((r) => r.evaluation.passed).length;
  const avgScore = totalScore / results.length;
  const passRate = passedCount / results.length;
  console.log(`Average Score: ${avgScore.toFixed(1)}/10`);
  console.log(`Pass Rate: ${(passRate * 100).toFixed(0)}%`);
  return results;
}

Use Cases

  • prompt testing
  • model comparison
  • quality assurance

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.