import os

import pandas as pd

from arize.experiments import EvaluationResult, Evaluator
from phoenix.evals import (
    HALLUCINATION_PROMPT_RAILS_MAP,
    HALLUCINATION_PROMPT_TEMPLATE,
    OpenAIModel,
    llm_classify,
)

# The evaluator below reads the OpenAI key from the environment; set
# OPENAI_API_KEY before running.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
class HallucinationEvaluator(Evaluator):
    def evaluate(self, output, dataset_row, **kwargs) -> EvaluationResult:
        print("Evaluating outputs")
        expected_output = dataset_row["attributes.llm.output_messages"]
        # Pair the task output with the reference output in a one-row
        # DataFrame, the input format llm_classify expects.
        df_in = pd.DataFrame(
            {"selected_output": output, "expected_output": expected_output},
            index=[0],
        )
        # Ask an LLM judge to classify the output; rails must be the list of
        # allowed labels, so unpack the values of the rails map.
        eval_df = llm_classify(
            dataframe=df_in,
            template=HALLUCINATION_PROMPT_TEMPLATE,
            model=OpenAIModel(model="gpt-4o-mini", api_key=OPENAI_API_KEY),
            rails=list(HALLUCINATION_PROMPT_RAILS_MAP.values()),
            provide_explanation=True,
        )
        # Map the judge's label to a binary score: 1 for "factual", 0 otherwise.
        label = eval_df["label"].iloc[0]
        score = 1 if label == "factual" else 0
        explanation = eval_df["explanation"].iloc[0]
        return EvaluationResult(score=score, label=label, explanation=explanation)
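

# A minimal smoke test of the evaluator in isolation. The sample row and
# output below are hypothetical; in a real experiment the runner (e.g. the
# Arize experiments client) supplies `output` and `dataset_row` for each row
# and collects the returned EvaluationResult.
if __name__ == "__main__":
    sample_row = {
        "attributes.llm.output_messages": "Paris is the capital of France.",
    }
    result = HallucinationEvaluator().evaluate(
        output="Paris is the capital of France.",
        dataset_row=sample_row,
    )
    print(result.score, result.label, result.explanation)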