python · advanced

RAG Evaluation with RAGAS

Evaluate RAG pipeline quality using RAGAS metrics: faithfulness, context recall, and answer relevance.

python — Press ⌘/Ctrl + Shift + C to copy

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_recall

# Toy evaluation set, written one record per RAG interaction so each question
# sits next to its own answer, retrieved contexts and reference answer.
# Each 'contexts' value is a list of passages (a single passage per sample here).
_samples = [
    {
        'question': 'What is RAG?',
        'answer': 'RAG combines retrieval with generation to improve LLM accuracy.',
        'contexts': ['RAG stands for Retrieval Augmented Generation. It fetches relevant documents and feeds them to an LLM.'],
        'ground_truth': 'RAG is a technique that retrieves relevant documents to improve LLM response quality.',
    },
    {
        'question': 'What is LangChain?',
        'answer': 'LangChain is a framework for building LLM applications.',
        'contexts': ['LangChain is an open-source Python framework for developing LLM-powered applications.'],
        'ground_truth': 'LangChain helps developers build applications powered by large language models.',
    },
]

# Pivot the records into a column-oriented mapping (column name -> list of
# per-sample values); the tuple fixes the column order.
_columns = ('question', 'answer', 'contexts', 'ground_truth')
data = {column: [sample[column] for sample in _samples] for column in _columns}

# Wrap the column-oriented sample dict in a Hugging Face Dataset, which is the
# input handed to ragas' evaluate().
dataset = Dataset.from_dict(data)

# Score every sample with the three imported metrics.
# NOTE(review): these metrics are presumably LLM-judged, so evaluate() likely
# needs model/API credentials configured — confirm for the installed ragas version.
results = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_recall],
)

# First the result object itself, then the per-sample table restricted to the
# three score columns.
print(results)
per_sample_scores = results.to_pandas()
print(per_sample_scores[['faithfulness', 'answer_relevancy', 'context_recall']])