Python · Intermediate

RAG Retrieval Quality Metrics

Compute precision@k, recall@k, and mean reciprocal rank (MRR) to evaluate vector retrieval quality for retrieval-augmented generation (RAG).

python
import numpy as np

def precision_at_k(retrieved: list[str], relevant: set[str], k: int) -> float:
    """Return precision@k for a single query.

    Args:
        retrieved: Document ids in ranked order, best match first.
        relevant: Ids judged relevant for the query.
        k: Rank cutoff; must be a positive integer.

    Returns:
        The number of entries in ``retrieved[:k]`` that are in ``relevant``,
        divided by ``k``. The denominator is always ``k``, so a result list
        shorter than ``k`` is penalised for its missing slots.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        # k == 0 would divide by zero; k < 0 would slice from the end of the
        # list and divide by a negative number, yielding a negative score.
        raise ValueError(f"k must be a positive integer, got {k}")
    hits = sum(1 for doc in retrieved[:k] if doc in relevant)
    return hits / k

def recall_at_k(retrieved: list[str], relevant: set[str], k: int) -> float:
    """Return recall@k for a single query.

    Args:
        retrieved: Document ids in ranked order, best match first.
        relevant: Ids judged relevant for the query.
        k: Rank cutoff; must be non-negative.

    Returns:
        The fraction of ``relevant`` ids that appear in ``retrieved[:k]``,
        or ``0.0`` when ``relevant`` is empty.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        # A negative cutoff would slice from the end of the list instead of
        # taking the top-k, silently producing a meaningless score.
        raise ValueError(f"k must be non-negative, got {k}")
    if not relevant:
        return 0.0
    # Count distinct ids: a document repeated in the result list must not be
    # credited more than once, otherwise recall could exceed 1.0.
    found = set(retrieved[:k]).intersection(relevant)
    return len(found) / len(relevant)

def mrr(results: list[list[str]], relevant_sets: list[set[str]]) -> float:
    """Return the mean reciprocal rank (MRR) over a batch of queries.

    For each query the reciprocal rank is ``1 / rank`` of the first retrieved
    document that is relevant (ranks start at 1), or ``0.0`` when none of the
    retrieved documents is relevant.

    Args:
        results: One ranked list of document ids per query, best match first.
        relevant_sets: The relevant ids for each query, aligned with
            ``results`` by position.

    Returns:
        The average reciprocal rank as a builtin ``float``; ``0.0`` when no
        queries are given.

    Raises:
        ValueError: If ``results`` and ``relevant_sets`` differ in length.
    """
    if len(results) != len(relevant_sets):
        # zip() would silently drop the unmatched queries and skew the mean.
        raise ValueError(
            f"results and relevant_sets must have the same length, "
            f"got {len(results)} and {len(relevant_sets)}"
        )

    rr_scores = []
    for retrieved, relevant in zip(results, relevant_sets):
        first_hit = next(
            (rank for rank, doc in enumerate(retrieved, 1) if doc in relevant),
            None,
        )
        rr_scores.append(1 / first_hit if first_hit is not None else 0.0)

    if not rr_scores:
        # np.mean([]) returns nan and emits a RuntimeWarning.
        return 0.0
    # Convert from np.float64 so the return matches the annotation.
    return float(np.mean(rr_scores))

# Sample evaluation set: each entry pairs a ranked result list with the set of
# ids considered relevant for that query.
queries = [
    {
        'retrieved': ['doc1', 'doc2', 'doc3', 'doc4', 'doc5'],
        'relevant': {'doc1', 'doc3'},
    },
    {
        'retrieved': ['doc2', 'doc1', 'doc3', 'doc4', 'doc5'],
        'relevant': {'doc1'},
    },
]

# Report precision and recall at a cutoff of 5 for every query.
i = 0
for q in queries:
    i += 1
    ranked_docs = q['retrieved']
    relevant_docs = q['relevant']
    p5 = precision_at_k(ranked_docs, relevant_docs, 5)
    r5 = recall_at_k(ranked_docs, relevant_docs, 5)
    print(f'Q{i}: P@5={p5:.2f}, R@5={r5:.2f}')

# Aggregate ranking quality across the whole query set.
all_ranked = [entry['retrieved'] for entry in queries]
all_relevant = [entry['relevant'] for entry in queries]
mrr_score = mrr(all_ranked, all_relevant)
print(f'MRR: {mrr_score:.3f}')

Use Cases

  • retrieval benchmarking
  • RAG evaluation
  • search quality

Tags

Related Snippets

Similar patterns you can reuse in the same workflow.