Next you need to connect to Arize AX and enter the relevant keys.
# Collect the credentials used by the rest of this walkthrough. A value that is
# already present in the environment is kept; anything missing is prompted for
# with getpass so the secret is not echoed back.
import os
from getpass import getpass

import nest_asyncio

# Later steps call asyncio.run() / use await at the top level; nest_asyncio is
# applied up front so that works where an event loop is already running
# (e.g. a notebook kernel).
nest_asyncio.apply()

_REQUIRED_KEYS = (
    ("SPACE_ID", "🔑 Enter your Arize AX Space ID: "),
    ("ARIZE_API_KEY", "🔑 Enter your Arize AX API Key: "),
    ("OPENAI_API_KEY", "🔑 Enter your OpenAI API key: "),
)

for _env_name, _prompt in _REQUIRED_KEYS:
    # Empty strings count as "missing", same as an unset variable.
    if not os.environ.get(_env_name):
        os.environ[_env_name] = getpass(_prompt)
Here we’ve set up a basic agent that can solve math problems. We have a function tool that can solve math equations, and an agent that can use this tool. We’ll use the Runner class to run the agent and get the final output.
from agents import function_tool, Runner


# NOTE(review): the docstring below is presumably what function_tool exposes to
# the model as the tool description, so its wording is kept as-is; only the
# stray quote after "Args:" was removed so the argument section is well-formed.
@function_tool
def solve_equation(equation: str) -> str:
    """Use python to evaluate the math equation, instead of thinking about it yourself.

    Args:
        equation: string which to pass into eval() in python
    """
    # SECURITY: eval() executes arbitrary Python, and `equation` is text
    # produced by the model. That is acceptable for a local demo only; do not
    # expose this tool to untrusted prompts without replacing eval() with a
    # restricted arithmetic evaluator.
    return str(eval(equation))
from agents import Agent

# The math-solving agent used throughout the walkthrough. Its only tool is
# solve_equation, and the instructions tell it to evaluate problems with
# python instead of answering directly.
agent = Agent(
    name="Math Solver",
    instructions="You solve math problems by evaluating them with python and returning the result",
    tools=[solve_equation],
)
result = await Runner.run(agent, "what is 15 + 28?")# Run Result objectprint(result)# Get the final outputprint(result.final_output)# Get the entire list of messages recorded to generate the final outputprint(result.to_input_list())
Now that we have a basic agent, let’s evaluate whether the agent responded correctly.
Agents can go awry for a variety of reasons. We can use Ragas to evaluate whether the agent responded correctly. Two Ragas measurements help with this:
Tool call accuracy - did our agent choose the right tool with the right arguments?
Agent goal accuracy - did our agent accomplish the stated goal and get to the right outcome?
Let’s set up our evaluation by defining our task function, our evaluator, and our dataset.
import asyncio

from agents import Runner


# This is our task function. It takes a question and returns the final output
# and the messages recorded to generate the final output.
async def solve_math_problem(input):
    """Run the math agent on one question and return its answer and transcript.

    Args:
        input: Either the question itself, or a dict whose "question" entry
            holds the question. The parameter name shadows the builtin but is
            kept unchanged because callers may pass it by keyword.

    Returns:
        A dict with two entries: "final_output" (the run's final_output) and
        "messages" (the run's to_input_list()).
    """
    # Leave the parameter untouched and work on a local instead of rebinding it.
    question = input.get("question") if isinstance(input, dict) else input
    run = await Runner.run(agent, question)
    return {
        "final_output": run.final_output,
        "messages": run.to_input_list(),
    }


# Smoke-test the task function on a single question.
result = asyncio.run(solve_math_problem("What is 15 + 28?"))
print(result)
This is helper code which converts the agent messages into a format that Ragas can use.
def conversation_to_ragas_sample(messages, reference_equation=None, reference_answer=None):
    """Convert the recorded messages of one agent run into a Ragas MultiTurnSample.

    Args:
        messages: Sequence of message dicts. Four item shapes are handled:
            role "user" (uses "content"), type "function_call" (uses "name"
            and JSON-encoded "arguments"), type "function_call_output" (uses
            "output"), and role "assistant" (uses "content", either a string
            or a list whose first element has a "text" entry). Anything else
            is ignored.
        reference_equation: Optional. When truthy, the returned sample carries
            ``reference_tool_calls`` built from the first "function_call" item
            found in ``messages``. The value itself is only used as a flag.
        reference_answer: Optional. Used as the sample's ``reference`` only
            when ``reference_equation`` is falsy.

    Returns:
        MultiTurnSample: the converted conversation, with
        ``reference_tool_calls``, ``reference``, or neither, depending on the
        arguments above.
    """
    import json

    from ragas.dataset_schema import MultiTurnSample
    from ragas.messages import AIMessage, HumanMessage, ToolCall, ToolMessage

    ragas_messages = []
    # Most recent function call that has not been paired with its output yet.
    pending_tool_call = None

    for item in messages:
        role = item.get("role")
        item_type = item.get("type")

        if role == "user":
            ragas_messages.append(HumanMessage(content=item["content"]))
            # A new user turn invalidates any unanswered tool call.
            pending_tool_call = None
        elif item_type == "function_call":
            pending_tool_call = ToolCall(
                name=item["name"],
                args=json.loads(item["arguments"]),
            )
        elif item_type == "function_call_output":
            if pending_tool_call is None:
                print("[WARN] ToolMessage without preceding ToolCall — skipping.")
                continue
            # The call is emitted as an empty AI message carrying the tool
            # call, immediately followed by the tool's output.
            ragas_messages.append(AIMessage(content="", tool_calls=[pending_tool_call]))
            ragas_messages.append(
                ToolMessage(
                    content=item["output"],
                    name=pending_tool_call.name,
                    tool_call_id="tool_call_1",
                )
            )
            pending_tool_call = None
        elif role == "assistant":
            content = item.get("content", "")
            if isinstance(content, list):
                # An empty content list previously raised IndexError; treat it
                # as an empty assistant message instead.
                content = content[0]["text"] if content else ""
            ragas_messages.append(AIMessage(content=content))

    print("Ragas_messages", ragas_messages)

    if reference_equation:
        # NOTE(review): the reference tool call is copied from the agent's own
        # first function call, not derived from reference_equation, so a
        # tool-call comparison against it checks the agent against itself.
        # Confirm this is intended.
        first_call = next(
            (item for item in messages if item.get("type") == "function_call"),
            None,
        )
        reference_tool_calls = None
        if first_call is not None:
            reference_tool_calls = [
                ToolCall(
                    name=first_call["name"],
                    args=json.loads(first_call["arguments"]),
                )
            ]
        return MultiTurnSample(
            user_input=ragas_messages,
            reference_tool_calls=reference_tool_calls,
        )

    if reference_answer:
        return MultiTurnSample(
            user_input=ragas_messages,
            reference=reference_answer,
        )

    return MultiTurnSample(user_input=ragas_messages)
# Here is an example of the function in action
# (reference_equation is set, so the sample carries reference tool calls and
# reference_answer is not used by this particular call).
sample = conversation_to_ragas_sample(
    # This is a list of messages recorded for "Calculate 15 + 28."
    result["messages"],
    reference_equation="15 + 28",
    reference_answer="43",
)
print(sample)
Now let’s set up our evaluator. We’ll import both metrics we’re measuring from Ragas, and use multi_turn_ascore(sample) to get the results. The AgentGoalAccuracyWithReference metric compares the final output to the reference to see if the goal was accomplished. The ToolCallAccuracy metric compares the tool call to the reference tool call to see if the tool call was made correctly.
Using the template below, we’re going to generate a dataframe of 10 questions we can use to test our math problem solving agent.
# Prompt used to generate the test questions for the math agent. Each problem
# category sits on its own line so the model sees a list rather than one
# run-on sentence.
MATH_GEN_TEMPLATE = """You are an assistant that generates diverse math problems for testing a math solver agent.
The problems should include:

Basic Operations: Simple addition, subtraction, multiplication, division problems.
Complex Arithmetic: Problems with multiple operations and parentheses following order of operations.
Exponents and Roots: Problems involving powers, square roots, and other nth roots.
Percentages: Problems involving calculating percentages of numbers or finding percentage changes.
Fractions: Problems with addition, subtraction, multiplication, or division of fractions.
Algebra: Simple algebraic expressions that can be evaluated with specific values.
Sequences: Finding sums, products, or averages of number sequences.
Word Problems: Converting word problems into mathematical equations.

Do not include any solutions in your generated problems.

Respond with a list, one math problem per line. Do not include any numbering at the beginning of each line.
Generate 10 diverse math problems. Ensure there are no duplicate problems."""
With the dataset of questions we generated above, we can use our experiments feature to track changes across models, prompts, and parameters for our agent. Let’s create this dataset and upload it into the platform.
# Note: This example uses Python SDK v7
import os
from uuid import uuid1

import pandas as pd
from arize.experimental.datasets import ArizeDatasetsClient
from arize.experimental.datasets.utils.constants import GENERATIVE

client = ArizeDatasetsClient(api_key=os.environ.get("ARIZE_API_KEY"))

# `conversations` must already exist at this point: a list of dicts that each
# provide a "question" and a "final_output" entry (it is not defined in this
# snippet).
questions = [conv["question"] for conv in conversations]
final_outputs = [conv["final_output"] for conv in conversations]

dataset_df = pd.DataFrame(
    {
        "id": [f"id_{i}" for i in range(len(questions))],
        "question": questions,
        "attributes.input.value": questions,
        "attributes.output.value": final_outputs,
    }
)

# A short uuid-derived suffix keeps repeated runs from reusing a dataset name.
dataset_name = "math-questions-" + str(uuid1())[:5]

dataset = client.create_dataset(
    space_id=os.environ.get("SPACE_ID"),
    dataset_name=dataset_name,
    data=dataset_df,
    dataset_type=GENERATIVE,
)

print(dataset_df.head())
Finally, we run our experiment and view the results.