Evaluations (evals) let you systematically test your agent's quality by running it against predefined test cases and scoring the responses.
Quick Start
alias ADK.Eval
alias ADK.Eval.{Case, Scorer}
# 1. Build the agent under test and the runner that drives it
agent =
  ADK.Agent.LlmAgent.new(
    name: "my_agent",
    model: "gemini-flash-latest",
    instruction: "You are a helpful assistant."
  )

runner = ADK.Runner.new(app_name: "my_app", agent: agent)

# 2. Describe each test case: an input plus the scorers applied to the response
greeting_case =
  Case.new(
    name: "greeting",
    input: "Hello!",
    scorers: [
      {Scorer.Contains, text: "hello", case_sensitive: false},
      {Scorer.ResponseLength, min: 1, max: 500}
    ]
  )

factual_case =
  Case.new(
    name: "factual_answer",
    input: "What is 2 + 2?",
    scorers: [{Scorer.Contains, text: "4"}]
  )

cases = [greeting_case, factual_case]

# 3. Run the cases through the runner and collect the report
report = Eval.run(runner, cases)
IO.puts(Eval.Report.format(report))
Built-in Scorers
ADK.Eval.Scorer.ExactMatch
Checks if the response exactly matches expected text.
{Scorer.ExactMatch, expected: "Hello, world!"}
ADK.Eval.Scorer.Contains
Checks if the response contains a substring.
{Scorer.Contains, text: "hello", case_sensitive: false}
ADK.Eval.Scorer.ResponseLength
Checks if the response length is within bounds.
{Scorer.ResponseLength, min: 10, max: 1000}
ADK.Eval.Scorer.ToolUsed
Checks if the agent called a specific tool.
{Scorer.ToolUsed, name: "search"}
Writing Custom Scorers
Implement the ADK.Eval.Scorer behaviour:
defmodule MyApp.Scorer.SentimentPositive do
@behaviour ADK.Eval.Scorer
# Scores a response as positive when its text contains any of a small set of
# positive words, compared case-insensitively. Returns a result map with
# `:score`, `:pass`, and `:details` keys.
@impl true
def score(events, _opts) do
  # Downcase once up front instead of once per candidate word.
  text =
    events
    |> ADK.Eval.Scorer.response_text()
    |> String.downcase()

  # Your scoring logic here
  positive_words = ~w(great good wonderful happy)

  # String.contains?/2 accepts a list of patterns and is true if any of them matches.
  if String.contains?(text, positive_words) do
    %{score: 1.0, pass: true, details: nil}
  else
    %{score: 0.0, pass: false, details: "No positive sentiment detected"}
  end
end
end
Using in ExUnit Tests
defmodule MyApp.AgentEvalTest do
use ExUnit.Case
test "agent passes basic eval suite" do
  # Run the whole suite with a 0.8 threshold passed to ADK.Eval.run/3.
  report = ADK.Eval.run(build_runner(), build_cases(), threshold: 0.8)
  all_passed = report.passed == report.total

  # On failure, show the formatted report so the failing cases are visible.
  assert all_passed, "Failed cases:\n" <> ADK.Eval.Report.format(report)
end
end
Options
ADK.Eval.run/3 accepts:
:threshold — minimum aggregate score for a case to pass (default: 1.0)
:user_id — user ID for sessions (default: "eval_user")
Helper Functions
ADK.Eval.Scorer.response_text(events) — extract all text from agent response events
ADK.Eval.Scorer.function_calls(events) — extract all function calls from events