import pandas as pd
from phoenix.evals import OpenAIModel, llm_classify
from arize.experimental.datasets.experiments.types import EvaluationResult


def correctness_eval(dataset_row: dict, output: dict) -> EvaluationResult:
    # Build a one-row dataframe pairing the dataset question with the task output
    df_in = pd.DataFrame(
        {"question": [dataset_row.get("question")], "response": [output]}
    )

    # Template for judging math problem solutions
    MATH_EVAL_TEMPLATE = """
You are evaluating whether a math problem was solved correctly.
[BEGIN DATA]
************
[Question]: {question}
************
[Response]: {response}
[END DATA]
Assess if the answer to the math problem is correct. First work out the correct answer yourself,
then compare with the provided response. Consider that there may be different ways to express the same answer
(e.g., "43" vs "The answer is 43" or "5.0" vs "5").
Your answer must be a single word, either "correct" or "incorrect".
"""

    # Run the LLM-as-a-judge evaluation, constraining the label to the rails
    rails = ["correct", "incorrect"]
    eval_df = llm_classify(
        data=df_in,
        template=MATH_EVAL_TEMPLATE,
        model=OpenAIModel(model="gpt-4o"),
        rails=rails,
        provide_explanation=True,
    )

    # Extract the label, map it to a numeric score, and keep the judge's explanation
    label = eval_df["label"].iloc[0]
    score = 1 if label == "correct" else 0
    explanation = eval_df["explanation"].iloc[0]

    # Return the evaluation result expected by Arize experiments
    return EvaluationResult(score=score, label=label, explanation=explanation)
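

# --- Usage sketch (not part of the original snippet) ---
# A minimal way to exercise the evaluator directly, assuming OPENAI_API_KEY is set
# in the environment. The sample question and output below are illustrative
# placeholders; in an Arize experiment, `dataset_row` and `output` would instead be
# supplied by the dataset and the experiment task when this function is registered
# as an evaluator.
if __name__ == "__main__":
    sample_row = {"question": "What is 6 * 7?"}          # hypothetical dataset row
    sample_output = {"answer": "The answer is 42"}       # stands in for a task's output
    result = correctness_eval(sample_row, sample_output)
    print(result)  # EvaluationResult with score, label, and explanation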