import langwatch
import pandas as pd


def run_evaluation(collections, transactions, k_values=(1, 3, 5)):
    """Run evaluation across different k values using LangWatch tracking."""
    # Initialize a new LangWatch evaluation experiment
    evaluation = langwatch.evaluation.init("embedding-model-evaluation")
    results = []
    for k in k_values:
        for table in collections:
            scores = []
            # Use evaluation.loop() to track the iteration
            for idx, transaction in evaluation.loop(enumerate(transactions)):
                query = transaction["query"]
                expected_docs = [transaction["expected"]]
                # Retrieve the top-k documents for the query
                retrieved_docs = retrieve(query, table, k)
                # Evaluate retrieval against the expected documents
                metrics = evaluate_retrieval(retrieved_docs, expected_docs)
                # Log individual transaction results to LangWatch
                evaluation.log(
                    "transaction_retrieval",
                    index=idx,
                    score=metrics["recall"],
                    data={
                        "query": query,
                        "expected": expected_docs,
                        "retrieved": retrieved_docs,
                        "k": k,
                        "collection": str(table),
                        "recall": metrics["recall"],
                        "mrr": metrics["mrr"],
                    },
                )
                scores.append({
                    "query": query,
                    "k": k,
                    "recall": metrics["recall"],
                    "mrr": metrics["mrr"],
                })
            # Calculate average metrics (guard against an empty transaction list)
            avg_recall = sum(r["recall"] for r in scores) / len(scores) if scores else 0.0
            avg_mrr = sum(r["mrr"] for r in scores) / len(scores) if scores else 0.0
            # Log aggregate metrics to LangWatch
            evaluation.log(
                f"collection_performance_{table}",
                index=k,  # Use k as the index so runs are comparable across k values
                score=avg_recall,
                data={
                    "collection": str(table),
                    "k": k,
                    "avg_recall": avg_recall,
                    "avg_mrr": avg_mrr,
                },
            )
            results.append({
                "collection": table,
                "k": k,
                "avg_recall": avg_recall,
                "avg_mrr": avg_mrr,
            })
    return pd.DataFrame(results)
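

# A minimal usage sketch. The transaction format and the retrieve()/
# evaluate_retrieval() helpers are assumed from earlier in this walkthrough;
# the collection names below ("products_minilm", "products_mpnet") are
# hypothetical placeholders, not part of the LangWatch API.
if __name__ == "__main__":
    sample_transactions = [
        {"query": "wireless noise-cancelling headphones", "expected": "doc_017"},
        {"query": "refund policy for opened items", "expected": "doc_042"},
    ]
    # Each entry is a vector-store collection/table built with a different
    # embedding model (hypothetical handles).
    sample_collections = ["products_minilm", "products_mpnet"]

    df = run_evaluation(sample_collections, sample_transactions, k_values=(1, 3, 5))
    # One row per (collection, k) pair, with avg_recall and avg_mrr columns
    print(df.sort_values(["collection", "k"]))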