Skip to content

Commit 21891b4

Browse files
authored
feat: add tool-aware live-eval payloads and a deterministic tool-call accuracy scorer (#1055)
* feat: add tool-aware live-eval payloads and a deterministic tool-call accuracy scorer
* chore: code reviews
* chore: update changeset
1 parent 3556385 commit 21891b4

File tree

13 files changed

+1578
-3
lines changed

13 files changed

+1578
-3
lines changed

.changeset/good-bottles-hike.md

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
---
2+
"@voltagent/core": minor
3+
"@voltagent/scorers": minor
4+
---
5+
6+
feat: add tool-aware live-eval payloads and a deterministic tool-call accuracy scorer
7+
8+
### What's New
9+
10+
- `@voltagent/core`
11+
- Live eval payload now includes `messages`, `toolCalls`, and `toolResults`.
12+
- If `toolCalls`/`toolResults` are not explicitly provided, they are derived from the normalized message/step chain.
13+
- New exported eval types: `AgentEvalToolCall` and `AgentEvalToolResult`.
14+
15+
- `@voltagent/scorers`
16+
- Added prebuilt `createToolCallAccuracyScorerCode` for deterministic tool evaluation.
17+
- Supports both single-tool checks (`expectedTool`) and ordered tool-chain checks (`expectedToolOrder`).
18+
- Supports strict and lenient matching modes.
19+
20+
### Code Examples
21+
22+
Built-in tool-call scorer:
23+
24+
```ts
25+
import { createToolCallAccuracyScorerCode } from "@voltagent/scorers";
26+
27+
const toolOrderScorer = createToolCallAccuracyScorerCode({
28+
expectedToolOrder: ["searchProducts", "checkInventory"],
29+
strictMode: false,
30+
});
31+
```
32+
33+
Custom scorer using `toolCalls` + `toolResults`:
34+
35+
```ts
36+
import { buildScorer } from "@voltagent/core";
37+
38+
interface ToolEvalPayload extends Record<string, unknown> {
39+
toolCalls?: Array<{ toolName?: string }>;
40+
toolResults?: Array<{ isError?: boolean; error?: unknown }>;
41+
}
42+
43+
const toolExecutionHealthScorer = buildScorer<ToolEvalPayload, Record<string, unknown>>({
44+
id: "tool-execution-health",
45+
label: "Tool Execution Health",
46+
})
47+
.score(({ payload }) => {
48+
const toolCalls = payload.toolCalls ?? [];
49+
const toolResults = payload.toolResults ?? [];
50+
51+
const failedResults = toolResults.filter(
52+
(result) => result.isError === true || Boolean(result.error)
53+
);
54+
const completionRatio =
55+
toolCalls.length === 0 ? 1 : Math.min(toolResults.length / toolCalls.length, 1);
56+
57+
return {
58+
score: Math.max(0, completionRatio - failedResults.length * 0.25),
59+
metadata: {
60+
toolCallCount: toolCalls.length,
61+
toolResultCount: toolResults.length,
62+
failedResultCount: failedResults.length,
63+
},
64+
};
65+
})
66+
.build();
67+
```

examples/with-live-evals/README.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,3 +51,9 @@ VoltAgent is an open-source TypeScript framework for creating and managing AI ag
5151
```bash
5252
npm create voltagent-app@latest -- --example with-live-evals
5353
```
54+
55+
## What This Example Demonstrates
56+
57+
- Live eval setup with multiple scorer types (heuristic, LLM-based, and custom)
58+
- Custom tool-call order scorer using `payload.toolCalls`
59+
- Custom scorer access to `payload.toolResults` for tool execution quality checks

examples/with-live-evals/src/index.ts

Lines changed: 136 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import VoltAgent, { Agent, VoltAgentObservability, buildScorer } from "@voltagent/core";
1+
import VoltAgent, { Agent, VoltAgentObservability, buildScorer, createTool } from "@voltagent/core";
22
import {
33
createAnswerCorrectnessScorer,
44
createAnswerRelevancyScorer,
@@ -10,6 +10,7 @@ import {
1010
createModerationScorer,
1111
createPossibleScorer,
1212
createSummaryScorer,
13+
createToolCallAccuracyScorerCode,
1314
createTranslationScorer,
1415
scorers,
1516
} from "@voltagent/scorers";
@@ -74,6 +75,115 @@ const customScorer = buildScorer({
7475
})
7576
.build();
7677

78+
// Small in-memory product list that the searchProducts and checkInventory
// tools in this example read from. `inStock` is what checkInventory reports
// back as `available`.
const productCatalog = [
79+
{ id: "laptop-pro-13", name: "Laptop Pro 13", price: 1299, inStock: 8 },
80+
{ id: "laptop-air-14", name: "Laptop Air 14", price: 999, inStock: 14 },
81+
{ id: "office-monitor-27", name: "Office Monitor 27", price: 299, inStock: 0 },
82+
];
83+
84+
// Demo tool: searches productCatalog by product name.
// Takes a single string `query` and resolves to the echoed query, the number
// of matches, and the id/name/price of every matching product.
const searchProductsTool = createTool({
85+
name: "searchProducts",
86+
description: "Searches a small product catalog by query and returns product candidates.",
87+
parameters: z.object({
88+
query: z.string().describe("Product search query"),
89+
}),
90+
execute: async ({ query }: { query: string }) => {
91+
// Lowercase both sides so the substring match on the name is case-insensitive.
const normalizedQuery = query.toLowerCase();
92+
const matches = productCatalog.filter((product) =>
93+
product.name.toLowerCase().includes(normalizedQuery),
94+
);
95+
96+
return {
97+
query,
98+
total: matches.length,
99+
// Only id, name and price are exposed here; stock levels are left to checkInventory.
results: matches.map(({ id, name, price }) => ({ id, name, price })),
100+
};
101+
},
102+
});
103+
104+
// Demo tool: reports the stock level for one product id from productCatalog.
// The id must match a catalog entry exactly. An unknown id is reported through
// the return value (isError: true, available: 0) rather than by throwing.
const checkInventoryTool = createTool({
105+
name: "checkInventory",
106+
description: "Checks stock status for a product id.",
107+
parameters: z.object({
108+
productId: z.string().describe("Product id from searchProducts result"),
109+
}),
110+
execute: async ({ productId }: { productId: string }) => {
111+
const found = productCatalog.find((product) => product.id === productId);
112+
// Not-found path: error-shaped result that toolExecutionHealthScorer counts as a failure.
if (!found) {
113+
return {
114+
productId,
115+
isError: true,
116+
error: "Product not found",
117+
available: 0,
118+
};
119+
}
120+
121+
return {
122+
productId,
123+
available: found.inStock,
124+
isError: false,
125+
};
126+
},
127+
});
128+
129+
// Subset of a tool-result entry that toolExecutionHealthScorer reads.
// Every field is optional and extra keys are allowed via the Record base.
// NOTE(review): `result` presumably carries the tool's own return value —
// confirm against the eval payload types exported by @voltagent/core.
interface ToolEvalToolResult extends Record<string, unknown> {
130+
result?: unknown;
131+
isError?: boolean;
132+
error?: unknown;
133+
}
134+
135+
// Payload shape used to type the tool scorers in this example: the optional
// list of tool calls (only `toolName` is read) and the optional list of
// tool results. Extra keys are allowed via the Record base.
interface ToolEvalPayload extends Record<string, unknown> {
136+
toolCalls?: Array<{ toolName?: string }>;
137+
toolResults?: ToolEvalToolResult[];
138+
}
139+
140+
// Prebuilt tool-call accuracy scorer configured with the expected order
// searchProducts -> checkInventory and strictMode turned off.
// NOTE(review): the exact difference between strict and non-strict matching
// is defined in @voltagent/scorers, not here — confirm before relying on it.
const toolCallOrderScorer = createToolCallAccuracyScorerCode<ToolEvalPayload>({
141+
expectedToolOrder: ["searchProducts", "checkInventory"],
142+
strictMode: false,
143+
});
144+
145+
// Custom scorer: rates how cleanly the tool calls in a run executed.
// score = max(0, completionRatio - 0.25 * failedResultCount), where
// completionRatio is results/calls capped at 1 (or 1 when no tool was called).
// The counts used in the calculation are also returned as metadata.
const toolExecutionHealthScorer = buildScorer<ToolEvalPayload, Record<string, unknown>>({
146+
id: "tool-execution-health",
147+
label: "Tool Execution Health",
148+
})
149+
.score(({ payload }) => {
150+
// Missing arrays are treated as "no tool activity".
const toolCalls = payload.toolCalls ?? [];
151+
const toolResults = payload.toolResults ?? [];
152+
153+
// Names are collected for metadata only; calls without a name are dropped
// and the list does not influence the score.
const calledToolNames = toolCalls
154+
.map((call) => call.toolName)
155+
.filter((name): name is string => Boolean(name));
156+
157+
// A result counts as failed when the entry itself is flagged
// (isError === true or a truthy error) or, failing that, when its nested
// `result` object carries the same flags.
const failedResults = toolResults.filter((toolResult) => {
158+
if (toolResult.isError === true || Boolean(toolResult.error)) {
159+
return true;
160+
}
161+
162+
if (toolResult.result && typeof toolResult.result === "object") {
163+
const resultRecord = toolResult.result as Record<string, unknown>;
164+
return resultRecord.isError === true || Boolean(resultRecord.error);
165+
}
166+
167+
return false;
168+
});
169+
170+
// Fraction of calls that produced a result, capped at 1; no calls counts as fully complete.
const completionRatio =
171+
toolCalls.length === 0 ? 1 : Math.min(toolResults.length / toolCalls.length, 1);
172+
// Each failed result costs 0.25; the score never goes below 0.
const score = Math.max(0, completionRatio - failedResults.length * 0.25);
173+
174+
return {
175+
score,
176+
metadata: {
177+
calledToolNames,
178+
toolCallCount: toolCalls.length,
179+
toolResultCount: toolResults.length,
180+
failedResultCount: failedResults.length,
181+
completionRatio,
182+
},
183+
};
184+
})
185+
.build();
186+
77187
const HELPFULNESS_SCHEMA = z.object({
78188
score: z.number().min(0).max(1).describe("Score from 0 to 1 for helpfulness"),
79189
reason: z.string().describe("Explanation of the score"),
@@ -295,6 +405,26 @@ const supportAgent = new Agent({
295405
},
296406
});
297407

408+
// Agent used to demonstrate tool-aware live evals: it is given both demo
// tools, instructed to call searchProducts before checkInventory, and has the
// tool-call order scorer and the tool execution health scorer attached.
const toolEvalAgent = new Agent({
409+
name: "tool-eval-demo",
410+
instructions: `You are a product assistant.
411+
Always call searchProducts first, then call checkInventory for a selected product before finalizing your answer.
412+
If no products are found, explain that clearly.`,
413+
model: "openai/gpt-4o-mini",
414+
tools: [searchProductsTool, checkInventoryTool],
415+
eval: {
416+
// NOTE(review): ratio sampling with rate 1 presumably evaluates every run — confirm in the live-eval docs.
sampling: { type: "ratio", rate: 1 },
417+
scorers: {
418+
toolCallOrder: {
419+
scorer: toolCallOrderScorer,
420+
},
421+
toolExecutionHealth: {
422+
scorer: toolExecutionHealthScorer,
423+
},
424+
},
425+
},
426+
});
427+
298428
const singleEvalAgent = new Agent({
299429
name: "single-eval-demo",
300430
instructions: "You are a helpful assistant that answers questions about VoltAgent.",
@@ -340,6 +470,7 @@ const scorerFeedbackAgent = new Agent({
340470
new VoltAgent({
341471
agents: {
342472
support: supportAgent,
473+
toolEval: toolEvalAgent,
343474
singleEval: singleEvalAgent,
344475
scorerFeedback: scorerFeedbackAgent,
345476
},
@@ -350,7 +481,11 @@ new VoltAgent({
350481
// Demo driver: sends one prompt to singleEvalAgent and then one to
// toolEvalAgent (awaited one after the other), and prints both questions
// and both text responses to the console.
(async () => {
351482
const question = "How can I enable live eval scorers in VoltAgent?";
352483
const result = await singleEvalAgent.generateText(question);
484+
const toolQuestion = "Find a laptop and check inventory before recommending one.";
485+
// NOTE(review): maxSteps: 4 presumably caps the number of model/tool steps for this call — confirm.
const toolResult = await toolEvalAgent.generateText(toolQuestion, { maxSteps: 4 });
353486

354487
console.log("Question:\n", question, "\n");
355488
console.log("Agent response:\n", result.text, "\n");
489+
console.log("Tool eval question:\n", toolQuestion, "\n");
490+
console.log("Tool eval response:\n", toolResult.text, "\n");
356491
})();

0 commit comments

Comments
 (0)