From 23f713fee1789628c3eb6c20e19272957dc203dc Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 16 Apr 2026 03:37:35 +0000 Subject: [PATCH 1/7] feat(core): expose {{ tool_calls }} template variable for LLM graders MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a new `{{ tool_calls }}` template variable that provides LLM graders with a formatted summary of tool calls from agent execution. Previously, LLM graders were blind to tool call details — only `{{ output }}` was available (plain text). The new variable formats each tool call as a compact line with the tool name and key input fields (skill name for Skill, file_path for Read/Write/Edit, command for Bash, pattern for Grep/Glob). Changes: - New `formatToolCalls()` utility in format-tool-calls.ts - Add `toolCalls` field to EvaluationContext interface - Add TOOL_CALLS to TEMPLATE_VARIABLES constants - Thread toolCalls through orchestrator pipeline (~15 sites) - Wire into all LLM grader prompt builders (~8 sites) - Auto-append `[[ ## tool_calls ## ]]` section in default templates - 12 new unit tests for formatToolCalls - Update docs site and skill references Closes #1121 --- .../content/docs/docs/graders/llm-graders.mdx | 2 + .../evaluation/graders/format-tool-calls.ts | 79 +++++++++++ packages/core/src/evaluation/graders/index.ts | 2 + .../evaluation/graders/llm-grader-prompt.ts | 32 ++++- .../core/src/evaluation/graders/llm-grader.ts | 24 +++- packages/core/src/evaluation/graders/types.ts | 2 + packages/core/src/evaluation/orchestrator.ts | 15 ++ .../core/src/evaluation/template-variables.ts | 2 + .../graders/format-tool-calls.test.ts | 132 ++++++++++++++++++ 9 files changed, 285 insertions(+), 5 deletions(-) create mode 100644 packages/core/src/evaluation/graders/format-tool-calls.ts create mode 100644 packages/core/test/evaluation/graders/format-tool-calls.test.ts diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx 
b/apps/web/src/content/docs/docs/graders/llm-graders.mdx index 3f9cd969..14f88ba6 100644 --- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx @@ -73,6 +73,7 @@ Score the response from 0.0 to 1.0 based on: | `expected_output` | Full resolved expected array, JSON-serialized | | `output` | Full provider output array, JSON-serialized | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | +| `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | ## Per-Grader Target @@ -228,6 +229,7 @@ Derived strings injected into grader prompts: | `expected_output` | Full resolved expected array, JSON-serialized | | `output` | Full provider output array, JSON-serialized | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | +| `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | **Example flow:** diff --git a/packages/core/src/evaluation/graders/format-tool-calls.ts b/packages/core/src/evaluation/graders/format-tool-calls.ts new file mode 100644 index 00000000..148df281 --- /dev/null +++ b/packages/core/src/evaluation/graders/format-tool-calls.ts @@ -0,0 +1,79 @@ +/** + * Formats tool calls from agent output messages into a human-readable summary. + * + * Used by `{{ tool_calls }}` template variable in LLM grader prompts. + * Extracts key input fields per tool to keep the summary compact: + * - Skill: `skill` arg + * - Read/Write/Edit: `file_path` + * - Bash: `command` + * - Grep/Glob: `pattern` + * - Otherwise: first non-empty string input field of at most 120 chars (if any) + * + * Returns empty string when there are no tool calls (template variable resolves to ''). + */ + +import type { Message } from '../providers/types.js'; + +/** + * Key input fields to extract per tool name. + * Order matters — first matching field wins. 
+ */ +const KEY_INPUT_FIELDS: ReadonlyMap<string, readonly string[]> = new Map([ + ['Skill', ['skill']], + ['Read', ['file_path']], + ['Write', ['file_path']], + ['Edit', ['file_path']], + ['Bash', ['command']], + ['Grep', ['pattern']], + ['Glob', ['pattern']], +]); + +/** Fallback: pick the first short string-valued field from input. */ +const MAX_FALLBACK_LENGTH = 120; + +export function formatToolCalls(output: readonly Message[] | undefined): string { + if (!output) return ''; + + const lines: string[] = []; + + for (const message of output) { + if (!message.toolCalls) continue; + for (const call of message.toolCalls) { + const toolName = call.tool ?? 'unknown'; + const detail = extractKeyDetail(toolName, call.input); + lines.push(detail ? `- ${toolName}: ${detail}` : `- ${toolName}`); + } + } + + return lines.length > 0 ? lines.join('\n') : ''; +} + +function extractKeyDetail(toolName: string, input: unknown): string { + if (!input || typeof input !== 'object') return ''; + const record = input as Record<string, unknown>; + + // Try known key fields for this tool + const knownFields = KEY_INPUT_FIELDS.get(toolName); + if (knownFields) { + for (const field of knownFields) { + const value = record[field]; + if (typeof value === 'string' && value.length > 0) { + return truncate(value); + } + } + } + + // Fallback: first short string-valued field + for (const value of Object.values(record)) { + if (typeof value === 'string' && value.length > 0 && value.length <= MAX_FALLBACK_LENGTH) { + return truncate(value); + } + } + + return ''; +} + +function truncate(value: string, maxLen = 120): string { + if (value.length <= maxLen) return value; + return `${value.slice(0, maxLen)}…`; +} diff --git a/packages/core/src/evaluation/graders/index.ts b/packages/core/src/evaluation/graders/index.ts index 107582ae..c87ff4a5 100644 --- a/packages/core/src/evaluation/graders/index.ts +++ b/packages/core/src/evaluation/graders/index.ts @@ -55,6 +55,8 @@ export { } from './llm-grader.js'; export type { LlmGraderOptions } from
'./llm-grader.js'; +export { formatToolCalls } from './format-tool-calls.js'; + export { SkillTriggerGrader } from './skill-trigger.js'; export { assembleLlmGraderPrompt } from './llm-grader-prompt.js'; diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts index 1cc7774b..fc50aae4 100644 --- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts @@ -24,6 +24,7 @@ export function assembleLlmGraderPrompt(input: { evaluatorConfig?: LlmGraderConfig; output?: readonly Message[]; fileChanges?: string; + toolCalls?: string; graderTemplateOverride?: string; }): LlmGraderPromptAssembly { const { @@ -32,6 +33,7 @@ export function assembleLlmGraderPrompt(input: { promptInputs, evaluatorConfig, fileChanges, + toolCalls, graderTemplateOverride, } = input; @@ -41,12 +43,19 @@ export function assembleLlmGraderPrompt(input: { if (rubrics && rubrics.length > 0) { const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0); if (hasScoreRanges) { - return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges); + return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls); } - return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges); + return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls); } - return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride); + return assembleFreeform( + evalCase, + candidate, + promptInputs, + fileChanges, + toolCalls, + graderTemplateOverride, + ); } function assembleFreeform( @@ -54,6 +63,7 @@ function assembleFreeform( candidate: string, promptInputs: PromptInputs, fileChanges?: string, + toolCalls?: string, graderTemplateOverride?: string, ): LlmGraderPromptAssembly { const formattedQuestion = @@ -67,6 +77,7 @@ function 
assembleFreeform( [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? '', // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(), @@ -77,10 +88,13 @@ function assembleFreeform( const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE; let userPrompt = substituteVariables(template, variables); - // Append file_changes section to default template only when present + // Append file_changes and tool_calls sections to default template only when present if (fileChanges && !graderTemplateOverride) { userPrompt += `\n\n[[ ## file_changes ## ]]\n${fileChanges}`; } + if (toolCalls && !graderTemplateOverride) { + userPrompt += `\n\n[[ ## tool_calls ## ]]\n${toolCalls}`; + } return { systemPrompt, @@ -96,6 +110,7 @@ function assembleChecklist( promptInputs: PromptInputs, rubrics: readonly RubricItem[], fileChanges?: string, + toolCalls?: string, ): LlmGraderPromptAssembly { const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 @@ -123,6 +138,10 @@ function assembleChecklist( parts.push('[[ ## file_changes ## ]]', fileChanges, ''); } + if (toolCalls) { + parts.push('[[ ## tool_calls ## ]]', toolCalls, ''); + } + parts.push('[[ ## rubrics ## ]]'); for (const rubric of rubrics) { @@ -150,6 +169,7 @@ function assembleScoreRange( promptInputs: PromptInputs, rubrics: readonly RubricItem[], fileChanges?: string, + toolCalls?: string, ): LlmGraderPromptAssembly { const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 @@ -178,6 +198,10 @@ function assembleScoreRange( parts.push('[[ ## file_changes ## ]]', fileChanges, ''); } + if (toolCalls) { + parts.push('[[ ## tool_calls ## ]]', toolCalls, ''); + } + parts.push('[[ ## scoring_criteria ## 
]]'); for (const rubric of rubrics) { diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts index 47812ef4..15e41ab9 100644 --- a/packages/core/src/evaluation/graders/llm-grader.ts +++ b/packages/core/src/evaluation/graders/llm-grader.ts @@ -272,6 +272,7 @@ export class LlmGrader implements Grader { [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', // Deprecated aliases — same values as the primary variables above [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), @@ -290,10 +291,13 @@ export class LlmGrader implements Grader { let userPrompt = substituteVariables(graderTemplate, variables); - // Append file_changes section to default template only when present + // Append file_changes and tool_calls sections to default template only when present if (context.fileChanges && !context.graderTemplateOverride && !this.graderTemplate) { userPrompt += `\n\n[[ ## file_changes ## ]]\n${context.fileChanges}`; } + if (context.toolCalls && !context.graderTemplateOverride && !this.graderTemplate) { + userPrompt += `\n\n[[ ## tool_calls ## ]]\n${context.toolCalls}`; + } const graderRawRequest: JsonObject = { userPrompt, @@ -691,6 +695,7 @@ export class LlmGrader implements Grader { [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? 
'', // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), @@ -726,6 +731,10 @@ export class LlmGrader implements Grader { parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); } + if (context.toolCalls) { + parts.push('[[ ## tool_calls ## ]]', context.toolCalls, ''); + } + if (rubrics && rubrics.length > 0) { parts.push('[[ ## rubrics ## ]]'); for (const rubric of rubrics) { @@ -766,6 +775,7 @@ export class LlmGrader implements Grader { [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), @@ -801,6 +811,10 @@ export class LlmGrader implements Grader { parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); } + if (context.toolCalls) { + parts.push('[[ ## tool_calls ## ]]', context.toolCalls, ''); + } + if (rubrics && rubrics.length > 0) { parts.push('[[ ## rubrics ## ]]'); for (const rubric of rubrics) { @@ -923,6 +937,10 @@ export class LlmGrader implements Grader { parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); } + if (context.toolCalls) { + parts.push('[[ ## tool_calls ## ]]', context.toolCalls, ''); + } + parts.push('[[ ## scoring_criteria ## ]]'); for (const rubric of rubrics) { @@ -985,6 +1003,10 @@ export class LlmGrader implements Grader { parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); } + if (context.toolCalls) { + parts.push('[[ ## tool_calls ## ]]', context.toolCalls, ''); + } + parts.push('[[ ## rubrics ## ]]'); for (const rubric of rubrics) { diff --git a/packages/core/src/evaluation/graders/types.ts b/packages/core/src/evaluation/graders/types.ts index 
14e4fd44..1d548e5f 100644 --- a/packages/core/src/evaluation/graders/types.ts +++ b/packages/core/src/evaluation/graders/types.ts @@ -55,6 +55,8 @@ export interface EvaluationContext { readonly availableTargets?: readonly string[]; /** Unified diff of file changes from workspace */ readonly fileChanges?: string; + /** Formatted summary of tool calls from agent execution */ + readonly toolCalls?: string; /** Absolute path to the workspace directory */ readonly workspacePath?: string; /** Docker workspace config: when present, code-grader commands run inside a container */ diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 6a1a3a0c..1e4b24e1 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -15,6 +15,7 @@ import { type EvaluationScore, type Grader, LlmGrader, + formatToolCalls, negateScore, scoreToVerdict, } from './graders.js'; @@ -2271,6 +2272,9 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise Provider | undefined; readonly availableTargets?: readonly string[]; readonly fileChanges?: string; + readonly toolCalls?: string; readonly workspacePath?: string; readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig; readonly verbose?: boolean; @@ -2659,6 +2665,7 @@ async function evaluateCandidate(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, threshold: evalThreshold, @@ -2688,6 +2695,7 @@ async function evaluateCandidate(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, threshold: evalThreshold, @@ -2775,6 +2783,7 @@ async function runEvaluatorsForCase(options: { readonly targetResolver?: (name: string) => Provider | undefined; readonly availableTargets?: readonly string[]; readonly fileChanges?: string; + readonly toolCalls?: string; readonly workspacePath?: string; readonly dockerConfig?: 
import('./types.js').DockerWorkspaceConfig; readonly threshold?: number; @@ -2802,6 +2811,7 @@ async function runEvaluatorsForCase(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, threshold, @@ -2832,6 +2842,7 @@ async function runEvaluatorsForCase(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, threshold, @@ -2868,6 +2879,7 @@ async function runEvaluatorsForCase(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, dependencyResults, @@ -2914,6 +2926,7 @@ async function runEvaluatorList(options: { readonly targetResolver?: (name: string) => Provider | undefined; readonly availableTargets?: readonly string[]; readonly fileChanges?: string; + readonly toolCalls?: string; readonly workspacePath?: string; readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig; readonly threshold?: number; @@ -2942,6 +2955,7 @@ async function runEvaluatorList(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, dependencyResults, @@ -2977,6 +2991,7 @@ async function runEvaluatorList(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, dependencyResults, diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts index 508d837d..9d92f0d8 100644 --- a/packages/core/src/evaluation/template-variables.ts +++ b/packages/core/src/evaluation/template-variables.ts @@ -8,6 +8,7 @@ * - {{ expected_output }} — reference answer as plain text * - {{ criteria }} — evaluation criteria string * - {{ file_changes }} — file diff (if available) + * - {{ tool_calls }} — formatted summary of tool calls from agent execution * * Deprecated aliases (emit a warning when used in custom templates): * - {{ input_text }} → use {{ input }} @@ -20,6 +21,7 @@ export const TEMPLATE_VARIABLES = { INPUT: 'input', 
OUTPUT: 'output', FILE_CHANGES: 'file_changes', + TOOL_CALLS: 'tool_calls', /** @deprecated Use INPUT instead — resolves to the same text value. */ INPUT_TEXT: 'input_text', /** @deprecated Use OUTPUT instead — resolves to the same text value. */ diff --git a/packages/core/test/evaluation/graders/format-tool-calls.test.ts b/packages/core/test/evaluation/graders/format-tool-calls.test.ts new file mode 100644 index 00000000..6ce1122c --- /dev/null +++ b/packages/core/test/evaluation/graders/format-tool-calls.test.ts @@ -0,0 +1,132 @@ +import { describe, expect, it } from 'vitest'; +import { formatToolCalls } from '../../../src/evaluation/graders/format-tool-calls.js'; +import type { Message } from '../../../src/evaluation/providers/types.js'; + +describe('formatToolCalls', () => { + it('returns empty string for undefined output', () => { + expect(formatToolCalls(undefined)).toBe(''); + }); + + it('returns empty string for empty messages array', () => { + expect(formatToolCalls([])).toBe(''); + }); + + it('returns empty string when no messages have tool calls', () => { + const messages: Message[] = [ + { role: 'assistant', content: 'Hello' }, + { role: 'user', content: 'Hi' }, + ]; + expect(formatToolCalls(messages)).toBe(''); + }); + + it('formats Skill tool calls with skill name', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'Skill', input: { skill: 'commit' } }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- Skill: commit'); + }); + + it('formats Read/Write/Edit tool calls with file_path', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'Read', input: { file_path: '/src/index.ts' } }, + { tool: 'Write', input: { file_path: '/src/output.ts', content: '...' 
} }, + { tool: 'Edit', input: { file_path: '/src/edit.ts', old_string: 'a', new_string: 'b' } }, + ], + }, + ]; + const result = formatToolCalls(messages); + expect(result).toBe('- Read: /src/index.ts\n- Write: /src/output.ts\n- Edit: /src/edit.ts'); + }); + + it('formats Bash tool calls with command', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'Bash', input: { command: 'npm test' } }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- Bash: npm test'); + }); + + it('formats Grep/Glob tool calls with pattern', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'Grep', input: { pattern: 'TODO', path: '/src' } }, + { tool: 'Glob', input: { pattern: '**/*.ts' } }, + ], + }, + ]; + expect(formatToolCalls(messages)).toBe('- Grep: TODO\n- Glob: **/*.ts'); + }); + + it('formats mixed tool calls across multiple messages', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'Read', input: { file_path: '/package.json' } }, + { tool: 'Bash', input: { command: 'ls -la' } }, + ], + }, + { role: 'user', content: 'ok' }, + { + role: 'assistant', + toolCalls: [{ tool: 'Skill', input: { skill: 'review-pr' } }], + }, + ]; + const result = formatToolCalls(messages); + expect(result).toBe('- Read: /package.json\n- Bash: ls -la\n- Skill: review-pr'); + }); + + it('falls back to first short string field for unknown tools', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'CustomTool', input: { query: 'find me something' } }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- CustomTool: find me something'); + }); + + it('shows tool name only when input is empty', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'SomeTool', input: {} }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- SomeTool'); + }); + + it('shows tool name only when input is undefined', () => { + const 
messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'SomeTool' }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- SomeTool'); + }); + + it('truncates long input values', () => { + const longCommand = 'x'.repeat(200); + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'Bash', input: { command: longCommand } }], + }, + ]; + const result = formatToolCalls(messages); + expect(result).toContain('- Bash: '); + // 120 chars + ellipsis + expect(result.length).toBeLessThan(200); + }); +}); From 370c6c4e6ab1e5caad8f39da72d3c9471ee87fff Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 16 Apr 2026 03:45:38 +0000 Subject: [PATCH 2/7] feat(examples): add tool-calls-template example for {{ tool_calls }} variable MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Demonstrates using {{ tool_calls }} in LLM grader prompts to verify skill invocation — an alternative to the deterministic skill-trigger grader when LLM reasoning is needed. 
Includes: - Mock CLI agent returning Skill/Read/Edit/Bash tool calls - LLM grader prompts using {{ tool_calls }} for positive/negative cases - 3 test cases: deploy skill, review-pr skill, no-skill bugfix --- .../tool-calls-template/.agentv/targets.yaml | 13 ++ .../evals/dataset.eval.yaml | 72 ++++++++++ .../tool-calls-template/mock-agent.ts | 124 ++++++++++++++++++ .../prompts/no-skill-check.md | 15 +++ .../prompts/skill-usage-check.md | 15 +++ 5 files changed, 239 insertions(+) create mode 100644 examples/features/tool-calls-template/.agentv/targets.yaml create mode 100644 examples/features/tool-calls-template/evals/dataset.eval.yaml create mode 100644 examples/features/tool-calls-template/mock-agent.ts create mode 100644 examples/features/tool-calls-template/prompts/no-skill-check.md create mode 100644 examples/features/tool-calls-template/prompts/skill-usage-check.md diff --git a/examples/features/tool-calls-template/.agentv/targets.yaml b/examples/features/tool-calls-template/.agentv/targets.yaml new file mode 100644 index 00000000..e299bb2b --- /dev/null +++ b/examples/features/tool-calls-template/.agentv/targets.yaml @@ -0,0 +1,13 @@ +targets: + - name: mock_agent + provider: cli + grader_target: grader + command: bun run ./mock-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} + cwd: .. + healthcheck: + command: bun run ./mock-agent.ts --healthcheck + cwd: .. + + - name: grader + provider: openrouter + model: anthropic/claude-sonnet-4 diff --git a/examples/features/tool-calls-template/evals/dataset.eval.yaml b/examples/features/tool-calls-template/evals/dataset.eval.yaml new file mode 100644 index 00000000..236abc2b --- /dev/null +++ b/examples/features/tool-calls-template/evals/dataset.eval.yaml @@ -0,0 +1,72 @@ +# Tool Calls Template Variable Demo +# +# Demonstrates using {{ tool_calls }} in LLM grader prompts to check +# whether an agent invoked the right skills during execution. 
+# +# The {{ tool_calls }} variable provides a formatted summary of all tool +# calls from the agent's output (tool name + key inputs). This lets LLM +# graders reason about tool usage without parsing raw Message[] data. +# +# This example uses a mock CLI agent that returns tool calls in its output. +# The LLM grader then inspects the tool call summary to verify skill usage. +# +# Setup: +# 1. Create examples/features/.env with your OPENROUTER_API_KEY +# 2. Run: cd examples/features && bun agentv eval tool-calls-template/evals/dataset.eval.yaml --target mock_agent + +name: tool-calls-template +description: LLM grader with {{ tool_calls }} template variable for skill verification + +execution: + target: mock_agent + +tests: + # ========================================== + # Example 1: Verify deploy skill was triggered + # The mock agent calls Skill(acme-deploy) + Bash(kubectl) + # LLM grader checks tool_calls for Skill invocation + # ========================================== + - id: deploy-skill-triggered + criteria: |- + The agent should invoke the acme-deploy skill to deploy the + payments-api service to production. + input: + - role: user + content: Deploy payments-api to production using the deploy skill. + assertions: + - name: skill-invoked + type: llm-grader + prompt: ../prompts/skill-usage-check.md + + # ========================================== + # Example 2: Verify review-pr skill was triggered + # The mock agent calls Skill(review-pr) + Read(files) + # LLM grader checks tool_calls for skill invocation + # ========================================== + - id: review-skill-triggered + criteria: |- + The agent should invoke the review-pr skill to review pull request #42. + input: + - role: user + content: Review pull request #42 using the review skill. 
+ assertions: + - name: skill-invoked + type: llm-grader + prompt: ../prompts/skill-usage-check.md + + # ========================================== + # Example 3: Verify NO skill was triggered (negative case) + # The mock agent only calls Read/Edit/Bash — no Skill tool + # LLM grader checks tool_calls to confirm absence of Skill + # ========================================== + - id: no-skill-for-bugfix + criteria: |- + The agent should fix the bug using basic tools (Read, Edit, Bash) + without invoking any skill. + input: + - role: user + content: Fix the null pointer bug in auth.ts. + assertions: + - name: no-skill-used + type: llm-grader + prompt: ../prompts/no-skill-check.md diff --git a/examples/features/tool-calls-template/mock-agent.ts b/examples/features/tool-calls-template/mock-agent.ts new file mode 100644 index 00000000..92d2eda0 --- /dev/null +++ b/examples/features/tool-calls-template/mock-agent.ts @@ -0,0 +1,124 @@ +#!/usr/bin/env bun +/** + * Mock Agent CLI for {{ tool_calls }} template variable demo. + * + * Simulates an agent that invokes skills and tools, returning tool call data + * in the output. Used to demonstrate LLM grader assertions that inspect + * tool calls via the {{ tool_calls }} template variable. + * + * Usage: + * bun run mock-agent.ts --prompt "..." 
--output output.json + * bun run mock-agent.ts --healthcheck + */ + +import { writeFileSync } from 'node:fs'; +import { parseArgs } from 'node:util'; + +interface ToolCall { + tool: string; + input?: unknown; + output?: unknown; +} + +interface Message { + role: 'assistant'; + content: string; + tool_calls?: ToolCall[]; +} + +interface AgentResponse { + output: Message[]; +} + +function generateResponse(prompt: string): AgentResponse { + const lower = prompt.toLowerCase(); + + // Scenario 1: Deploy request → triggers deploy skill + if (lower.includes('deploy')) { + return { + output: [ + { + role: 'assistant', + content: 'Deployment initiated for payments-api to production.', + tool_calls: [ + { + tool: 'Skill', + input: { skill: 'acme-deploy', args: '--service payments-api --env production' }, + }, + { tool: 'Bash', input: { command: 'kubectl rollout status deployment/payments-api' } }, + ], + }, + ], + }; + } + + // Scenario 2: Code review → triggers review-pr skill + if (lower.includes('review') || lower.includes('pull request')) { + return { + output: [ + { + role: 'assistant', + content: 'I reviewed the pull request and found no issues.', + tool_calls: [ + { tool: 'Skill', input: { skill: 'review-pr', args: '42' } }, + { tool: 'Read', input: { file_path: '/src/auth.ts' } }, + { tool: 'Read', input: { file_path: '/src/auth.test.ts' } }, + ], + }, + ], + }; + } + + // Scenario 3: File editing without skill invocation + if (lower.includes('fix') || lower.includes('bug')) { + return { + output: [ + { + role: 'assistant', + content: 'Fixed the null pointer bug in auth.ts.', + tool_calls: [ + { tool: 'Read', input: { file_path: '/src/auth.ts' } }, + { tool: 'Edit', input: { file_path: '/src/auth.ts' } }, + { tool: 'Bash', input: { command: 'npm test' } }, + ], + }, + ], + }; + } + + // Default: no tools + return { + output: [{ role: 'assistant', content: 'I processed your request.' 
}], + }; +} + +function main(): void { + const { values } = parseArgs({ + options: { + prompt: { type: 'string', short: 'p' }, + output: { type: 'string', short: 'o' }, + healthcheck: { type: 'boolean' }, + }, + allowPositionals: true, + }); + + if (values.healthcheck) { + console.log('OK'); + process.exit(0); + } + + if (!values.prompt || !values.output) { + console.error('Error: --prompt and --output are required'); + process.exit(1); + } + + const response = generateResponse(values.prompt); + writeFileSync(values.output, JSON.stringify(response, null, 2)); + + const firstMessage = response.output[0]; + if (firstMessage) { + console.log(firstMessage.content); + } +} + +main(); diff --git a/examples/features/tool-calls-template/prompts/no-skill-check.md b/examples/features/tool-calls-template/prompts/no-skill-check.md new file mode 100644 index 00000000..c9e7f3f0 --- /dev/null +++ b/examples/features/tool-calls-template/prompts/no-skill-check.md @@ -0,0 +1,15 @@ +You are evaluating whether an AI agent completed a task WITHOUT using a skill. + +[[ ## question ## ]] +{{ input }} + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## answer ## ]] +{{ output }} + +[[ ## tool_calls ## ]] +{{ tool_calls }} + +Based on the tool calls above, determine whether the agent completed the task using only basic tools (Read, Edit, Bash, etc.) without invoking any Skill tool. Score 1.0 if no Skill tool was used, 0.0 if a Skill tool was invoked. diff --git a/examples/features/tool-calls-template/prompts/skill-usage-check.md b/examples/features/tool-calls-template/prompts/skill-usage-check.md new file mode 100644 index 00000000..0f79d674 --- /dev/null +++ b/examples/features/tool-calls-template/prompts/skill-usage-check.md @@ -0,0 +1,15 @@ +You are evaluating whether an AI agent correctly used the expected skill. 
+ +[[ ## question ## ]] +{{ input }} + +[[ ## criteria ## ]] +{{ criteria }} + +[[ ## answer ## ]] +{{ output }} + +[[ ## tool_calls ## ]] +{{ tool_calls }} + +Based on the tool calls above, determine whether the agent invoked the correct skill as required by the criteria. Score 1.0 if the expected skill was triggered with appropriate arguments, 0.0 if not. From 8eee6bf2db830d8ac51c3d9e7da9bd3bbd2460d4 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 16 Apr 2026 03:58:15 +0000 Subject: [PATCH 3/7] fix(examples): use root targets.yaml and file:// prompt prefix for tool-calls example Move mock_agent and openrouter_grader targets to root .agentv/targets.yaml instead of a per-example targets file. Fix prompt references to use file:// prefix so they're resolved as file paths rather than inline text. Co-Authored-By: Claude Opus 4.6 --- .agentv/targets.yaml | 13 +++++++++++++ .../tool-calls-template/.agentv/targets.yaml | 13 ------------- .../tool-calls-template/evals/dataset.eval.yaml | 10 +++++----- 3 files changed, 18 insertions(+), 18 deletions(-) delete mode 100644 examples/features/tool-calls-template/.agentv/targets.yaml diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 067a75bf..3b46f5ce 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -117,6 +117,19 @@ targets: log_dir: ${{ CODEX_LOG_DIR }} log_format: json + # ── Mock targets (for feature examples) ────────────────────────── + - name: mock_agent + provider: cli + grader_target: openrouter_grader + command: bun run ../mock-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} + healthcheck: + command: bun run ../mock-agent.ts --healthcheck + + - name: openrouter_grader + provider: openrouter + api_key: ${{ OPENROUTER_API_KEY }} + model: ${{ OPENROUTER_GRADER_MODEL }} + # ── LLM targets (direct model access) ───────────────────────────── - name: gh-models provider: openai diff --git a/examples/features/tool-calls-template/.agentv/targets.yaml 
b/examples/features/tool-calls-template/.agentv/targets.yaml deleted file mode 100644 index e299bb2b..00000000 --- a/examples/features/tool-calls-template/.agentv/targets.yaml +++ /dev/null @@ -1,13 +0,0 @@ -targets: - - name: mock_agent - provider: cli - grader_target: grader - command: bun run ./mock-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} - cwd: .. - healthcheck: - command: bun run ./mock-agent.ts --healthcheck - cwd: .. - - - name: grader - provider: openrouter - model: anthropic/claude-sonnet-4 diff --git a/examples/features/tool-calls-template/evals/dataset.eval.yaml b/examples/features/tool-calls-template/evals/dataset.eval.yaml index 236abc2b..5d17a32c 100644 --- a/examples/features/tool-calls-template/evals/dataset.eval.yaml +++ b/examples/features/tool-calls-template/evals/dataset.eval.yaml @@ -11,8 +11,8 @@ # The LLM grader then inspects the tool call summary to verify skill usage. # # Setup: -# 1. Create examples/features/.env with your OPENROUTER_API_KEY -# 2. Run: cd examples/features && bun agentv eval tool-calls-template/evals/dataset.eval.yaml --target mock_agent +# 1. Create examples/features/.env with OPENROUTER_API_KEY and OPENROUTER_GRADER_MODEL +# 2. 
Run from repo root: bun agentv eval examples/features/tool-calls-template/evals/dataset.eval.yaml name: tool-calls-template description: LLM grader with {{ tool_calls }} template variable for skill verification @@ -36,7 +36,7 @@ tests: assertions: - name: skill-invoked type: llm-grader - prompt: ../prompts/skill-usage-check.md + prompt: file://../prompts/skill-usage-check.md # ========================================== # Example 2: Verify review-pr skill was triggered @@ -52,7 +52,7 @@ tests: assertions: - name: skill-invoked type: llm-grader - prompt: ../prompts/skill-usage-check.md + prompt: file://../prompts/skill-usage-check.md # ========================================== # Example 3: Verify NO skill was triggered (negative case) @@ -69,4 +69,4 @@ tests: assertions: - name: no-skill-used type: llm-grader - prompt: ../prompts/no-skill-check.md + prompt: file://../prompts/no-skill-check.md From 3374ef48019977ffcf0ca55ce484018a6f756992 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 16 Apr 2026 04:31:18 +0000 Subject: [PATCH 4/7] refactor(examples): use shared grader target and rename eval file - Remove openrouter_grader target, use shared grader (via GRADER_TARGET) - Rename dataset.eval.yaml to eval.yaml - Verified with both mock_agent (3/3 pass) and copilot (tool_calls populated) Co-Authored-By: Claude Opus 4.6 --- .agentv/targets.yaml | 7 +------ .../evals/{dataset.eval.yaml => eval.yaml} | 4 ++-- 2 files changed, 3 insertions(+), 8 deletions(-) rename examples/features/tool-calls-template/evals/{dataset.eval.yaml => eval.yaml} (95%) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 3b46f5ce..fbcba765 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -120,16 +120,11 @@ targets: # ── Mock targets (for feature examples) ────────────────────────── - name: mock_agent provider: cli - grader_target: openrouter_grader + grader_target: grader command: bun run ../mock-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} healthcheck: 
command: bun run ../mock-agent.ts --healthcheck - - name: openrouter_grader - provider: openrouter - api_key: ${{ OPENROUTER_API_KEY }} - model: ${{ OPENROUTER_GRADER_MODEL }} - # ── LLM targets (direct model access) ───────────────────────────── - name: gh-models provider: openai diff --git a/examples/features/tool-calls-template/evals/dataset.eval.yaml b/examples/features/tool-calls-template/evals/eval.yaml similarity index 95% rename from examples/features/tool-calls-template/evals/dataset.eval.yaml rename to examples/features/tool-calls-template/evals/eval.yaml index 5d17a32c..d265b978 100644 --- a/examples/features/tool-calls-template/evals/dataset.eval.yaml +++ b/examples/features/tool-calls-template/evals/eval.yaml @@ -11,8 +11,8 @@ # The LLM grader then inspects the tool call summary to verify skill usage. # # Setup: -# 1. Create examples/features/.env with OPENROUTER_API_KEY and OPENROUTER_GRADER_MODEL -# 2. Run from repo root: bun agentv eval examples/features/tool-calls-template/evals/dataset.eval.yaml +# 1. Set GRADER_TARGET in your .env (e.g. openrouter) +# 2. Run from repo root: bun agentv eval examples/features/tool-calls-template/evals/eval.yaml name: tool-calls-template description: LLM grader with {{ tool_calls }} template variable for skill verification From ab0b74c4caa4073190807a8931075a9885094c13 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 16 Apr 2026 04:50:05 +0000 Subject: [PATCH 5/7] refactor(examples): use workspace template with skills for copilot e2e Replace mock CLI agent with real copilot-compatible workspace template containing acme-deploy skill in all provider directories. Verified 3/3 pass with copilot target (skill triggered, rollback triggered, no skill for unrelated). Remove mock_agent target from root targets.yaml. 
Co-Authored-By: Claude Opus 4.6 --- .agentv/targets.yaml | 8 -- .../tool-calls-template/evals/eval.yaml | 61 +++------ .../tool-calls-template/mock-agent.ts | 124 ------------------ .../prompts/no-skill-check.md | 3 - .../prompts/skill-usage-check.md | 7 +- .../.agents/skills/acme-deploy/SKILL.md | 49 +++++++ .../.claude/skills/acme-deploy/SKILL.md | 49 +++++++ .../.copilot/skills/acme-deploy/SKILL.md | 49 +++++++ .../.github/skills/acme-deploy/SKILL.md | 49 +++++++ 9 files changed, 214 insertions(+), 185 deletions(-) delete mode 100644 examples/features/tool-calls-template/mock-agent.ts create mode 100644 examples/features/tool-calls-template/workspace/.agents/skills/acme-deploy/SKILL.md create mode 100644 examples/features/tool-calls-template/workspace/.claude/skills/acme-deploy/SKILL.md create mode 100644 examples/features/tool-calls-template/workspace/.copilot/skills/acme-deploy/SKILL.md create mode 100644 examples/features/tool-calls-template/workspace/.github/skills/acme-deploy/SKILL.md diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index fbcba765..067a75bf 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -117,14 +117,6 @@ targets: log_dir: ${{ CODEX_LOG_DIR }} log_format: json - # ── Mock targets (for feature examples) ────────────────────────── - - name: mock_agent - provider: cli - grader_target: grader - command: bun run ../mock-agent.ts --prompt {PROMPT} --output {OUTPUT_FILE} - healthcheck: - command: bun run ../mock-agent.ts --healthcheck - # ── LLM targets (direct model access) ───────────────────────────── - name: gh-models provider: openai diff --git a/examples/features/tool-calls-template/evals/eval.yaml b/examples/features/tool-calls-template/evals/eval.yaml index d265b978..030868ff 100644 --- a/examples/features/tool-calls-template/evals/eval.yaml +++ b/examples/features/tool-calls-template/evals/eval.yaml @@ -1,71 +1,42 @@ # Tool Calls Template Variable Demo # # Demonstrates using {{ tool_calls }} in LLM grader prompts 
to check -# whether an agent invoked the right skills during execution. +# whether an agent invoked the right skills during execution — without +# needing the skill-trigger evaluator. # -# The {{ tool_calls }} variable provides a formatted summary of all tool -# calls from the agent's output (tool name + key inputs). This lets LLM -# graders reason about tool usage without parsing raw Message[] data. +# The workspace template includes the acme-deploy skill in all provider +# directories (.claude/, .agents/, .copilot/, .github/) so any agent +# target can discover and use it. # -# This example uses a mock CLI agent that returns tool calls in its output. -# The LLM grader then inspects the tool call summary to verify skill usage. -# -# Setup: -# 1. Set GRADER_TARGET in your .env (e.g. openrouter) -# 2. Run from repo root: bun agentv eval examples/features/tool-calls-template/evals/eval.yaml +# Run: +# bun agentv eval examples/features/tool-calls-template/evals/eval.yaml --target copilot name: tool-calls-template description: LLM grader with {{ tool_calls }} template variable for skill verification +workspace: + template: ../workspace/ + execution: - target: mock_agent + workers: 1 tests: - # ========================================== - # Example 1: Verify deploy skill was triggered - # The mock agent calls Skill(acme-deploy) + Bash(kubectl) - # LLM grader checks tool_calls for Skill invocation - # ========================================== - id: deploy-skill-triggered - criteria: |- - The agent should invoke the acme-deploy skill to deploy the - payments-api service to production. - input: - - role: user - content: Deploy payments-api to production using the deploy skill. + input: How do I deploy payments-api to production? 
assertions: - name: skill-invoked type: llm-grader prompt: file://../prompts/skill-usage-check.md - # ========================================== - # Example 2: Verify review-pr skill was triggered - # The mock agent calls Skill(review-pr) + Read(files) - # LLM grader checks tool_calls for skill invocation - # ========================================== - - id: review-skill-triggered - criteria: |- - The agent should invoke the review-pr skill to review pull request #42. - input: - - role: user - content: Review pull request #42 using the review skill. + - id: rollback-skill-triggered + input: I need to roll back user-service in staging, what's the procedure? assertions: - name: skill-invoked type: llm-grader prompt: file://../prompts/skill-usage-check.md - # ========================================== - # Example 3: Verify NO skill was triggered (negative case) - # The mock agent only calls Read/Edit/Bash — no Skill tool - # LLM grader checks tool_calls to confirm absence of Skill - # ========================================== - - id: no-skill-for-bugfix - criteria: |- - The agent should fix the bug using basic tools (Read, Edit, Bash) - without invoking any skill. - input: - - role: user - content: Fix the null pointer bug in auth.ts. + - id: no-skill-for-unrelated + input: Write a Python function that parses JSON logs and extracts error messages. assertions: - name: no-skill-used type: llm-grader diff --git a/examples/features/tool-calls-template/mock-agent.ts b/examples/features/tool-calls-template/mock-agent.ts deleted file mode 100644 index 92d2eda0..00000000 --- a/examples/features/tool-calls-template/mock-agent.ts +++ /dev/null @@ -1,124 +0,0 @@ -#!/usr/bin/env bun -/** - * Mock Agent CLI for {{ tool_calls }} template variable demo. - * - * Simulates an agent that invokes skills and tools, returning tool call data - * in the output. Used to demonstrate LLM grader assertions that inspect - * tool calls via the {{ tool_calls }} template variable. 
- * - * Usage: - * bun run mock-agent.ts --prompt "..." --output output.json - * bun run mock-agent.ts --healthcheck - */ - -import { writeFileSync } from 'node:fs'; -import { parseArgs } from 'node:util'; - -interface ToolCall { - tool: string; - input?: unknown; - output?: unknown; -} - -interface Message { - role: 'assistant'; - content: string; - tool_calls?: ToolCall[]; -} - -interface AgentResponse { - output: Message[]; -} - -function generateResponse(prompt: string): AgentResponse { - const lower = prompt.toLowerCase(); - - // Scenario 1: Deploy request → triggers deploy skill - if (lower.includes('deploy')) { - return { - output: [ - { - role: 'assistant', - content: 'Deployment initiated for payments-api to production.', - tool_calls: [ - { - tool: 'Skill', - input: { skill: 'acme-deploy', args: '--service payments-api --env production' }, - }, - { tool: 'Bash', input: { command: 'kubectl rollout status deployment/payments-api' } }, - ], - }, - ], - }; - } - - // Scenario 2: Code review → triggers review-pr skill - if (lower.includes('review') || lower.includes('pull request')) { - return { - output: [ - { - role: 'assistant', - content: 'I reviewed the pull request and found no issues.', - tool_calls: [ - { tool: 'Skill', input: { skill: 'review-pr', args: '42' } }, - { tool: 'Read', input: { file_path: '/src/auth.ts' } }, - { tool: 'Read', input: { file_path: '/src/auth.test.ts' } }, - ], - }, - ], - }; - } - - // Scenario 3: File editing without skill invocation - if (lower.includes('fix') || lower.includes('bug')) { - return { - output: [ - { - role: 'assistant', - content: 'Fixed the null pointer bug in auth.ts.', - tool_calls: [ - { tool: 'Read', input: { file_path: '/src/auth.ts' } }, - { tool: 'Edit', input: { file_path: '/src/auth.ts' } }, - { tool: 'Bash', input: { command: 'npm test' } }, - ], - }, - ], - }; - } - - // Default: no tools - return { - output: [{ role: 'assistant', content: 'I processed your request.' 
}], - }; -} - -function main(): void { - const { values } = parseArgs({ - options: { - prompt: { type: 'string', short: 'p' }, - output: { type: 'string', short: 'o' }, - healthcheck: { type: 'boolean' }, - }, - allowPositionals: true, - }); - - if (values.healthcheck) { - console.log('OK'); - process.exit(0); - } - - if (!values.prompt || !values.output) { - console.error('Error: --prompt and --output are required'); - process.exit(1); - } - - const response = generateResponse(values.prompt); - writeFileSync(values.output, JSON.stringify(response, null, 2)); - - const firstMessage = response.output[0]; - if (firstMessage) { - console.log(firstMessage.content); - } -} - -main(); diff --git a/examples/features/tool-calls-template/prompts/no-skill-check.md b/examples/features/tool-calls-template/prompts/no-skill-check.md index c9e7f3f0..e79d40df 100644 --- a/examples/features/tool-calls-template/prompts/no-skill-check.md +++ b/examples/features/tool-calls-template/prompts/no-skill-check.md @@ -3,9 +3,6 @@ You are evaluating whether an AI agent completed a task WITHOUT using a skill. [[ ## question ## ]] {{ input }} -[[ ## criteria ## ]] -{{ criteria }} - [[ ## answer ## ]] {{ output }} diff --git a/examples/features/tool-calls-template/prompts/skill-usage-check.md b/examples/features/tool-calls-template/prompts/skill-usage-check.md index 0f79d674..828d21b4 100644 --- a/examples/features/tool-calls-template/prompts/skill-usage-check.md +++ b/examples/features/tool-calls-template/prompts/skill-usage-check.md @@ -1,15 +1,12 @@ -You are evaluating whether an AI agent correctly used the expected skill. +You are evaluating whether an AI agent invoked the expected skill. [[ ## question ## ]] {{ input }} -[[ ## criteria ## ]] -{{ criteria }} - [[ ## answer ## ]] {{ output }} [[ ## tool_calls ## ]] {{ tool_calls }} -Based on the tool calls above, determine whether the agent invoked the correct skill as required by the criteria. 
Score 1.0 if the expected skill was triggered with appropriate arguments, 0.0 if not. +Based on the tool calls above, determine whether the agent invoked the relevant skill to answer the question. Score 1.0 if a Skill tool was triggered, 0.0 if not. diff --git a/examples/features/tool-calls-template/workspace/.agents/skills/acme-deploy/SKILL.md b/examples/features/tool-calls-template/workspace/.agents/skills/acme-deploy/SKILL.md new file mode 100644 index 00000000..7ab31f68 --- /dev/null +++ b/examples/features/tool-calls-template/workspace/.agents/skills/acme-deploy/SKILL.md @@ -0,0 +1,49 @@ +--- +name: acme-deploy +description: Use when the user asks about deploying services, checking deployment status, rollback procedures, or release management at Acme Corp +--- + +# Acme Corp Deployment Procedures + +## Overview + +Internal deployment runbook for Acme Corp services. All deployments follow the Trident release pipeline. + +## Deployment Commands + +### Deploy to staging +```bash +trident push --env staging --service --tag +``` + +### Promote to production +```bash +trident promote --from staging --to prod --service --approval-ticket +``` +Production deploys require a JIRA approval ticket (prefix: DEPLOY-). + +### Rollback +```bash +trident rollback --env --service --to-version +``` +Rollbacks auto-notify #ops-alerts in Slack. 
+ +### Check deployment status +```bash +trident status --env --service +``` + +## Service Registry + +| Service | Owner Team | Staging URL | Prod URL | +|---------|-----------|-------------|----------| +| payments-api | Platform | payments.staging.acme.internal | payments.acme.internal | +| user-service | Identity | users.staging.acme.internal | users.acme.internal | +| notifications | Engagement | notify.staging.acme.internal | notify.acme.internal | + +## Rules + +- All prod deploys require a DEPLOY- JIRA ticket +- Staging deploys are auto-approved during business hours (9am-5pm PT) +- Rollbacks bypass approval but require post-mortem within 48h +- Deploy freezes are announced in #engineering-announcements diff --git a/examples/features/tool-calls-template/workspace/.claude/skills/acme-deploy/SKILL.md b/examples/features/tool-calls-template/workspace/.claude/skills/acme-deploy/SKILL.md new file mode 100644 index 00000000..7ab31f68 --- /dev/null +++ b/examples/features/tool-calls-template/workspace/.claude/skills/acme-deploy/SKILL.md @@ -0,0 +1,49 @@ +--- +name: acme-deploy +description: Use when the user asks about deploying services, checking deployment status, rollback procedures, or release management at Acme Corp +--- + +# Acme Corp Deployment Procedures + +## Overview + +Internal deployment runbook for Acme Corp services. All deployments follow the Trident release pipeline. + +## Deployment Commands + +### Deploy to staging +```bash +trident push --env staging --service --tag +``` + +### Promote to production +```bash +trident promote --from staging --to prod --service --approval-ticket +``` +Production deploys require a JIRA approval ticket (prefix: DEPLOY-). + +### Rollback +```bash +trident rollback --env --service --to-version +``` +Rollbacks auto-notify #ops-alerts in Slack. 
+ +### Check deployment status +```bash +trident status --env --service +``` + +## Service Registry + +| Service | Owner Team | Staging URL | Prod URL | +|---------|-----------|-------------|----------| +| payments-api | Platform | payments.staging.acme.internal | payments.acme.internal | +| user-service | Identity | users.staging.acme.internal | users.acme.internal | +| notifications | Engagement | notify.staging.acme.internal | notify.acme.internal | + +## Rules + +- All prod deploys require a DEPLOY- JIRA ticket +- Staging deploys are auto-approved during business hours (9am-5pm PT) +- Rollbacks bypass approval but require post-mortem within 48h +- Deploy freezes are announced in #engineering-announcements diff --git a/examples/features/tool-calls-template/workspace/.copilot/skills/acme-deploy/SKILL.md b/examples/features/tool-calls-template/workspace/.copilot/skills/acme-deploy/SKILL.md new file mode 100644 index 00000000..7ab31f68 --- /dev/null +++ b/examples/features/tool-calls-template/workspace/.copilot/skills/acme-deploy/SKILL.md @@ -0,0 +1,49 @@ +--- +name: acme-deploy +description: Use when the user asks about deploying services, checking deployment status, rollback procedures, or release management at Acme Corp +--- + +# Acme Corp Deployment Procedures + +## Overview + +Internal deployment runbook for Acme Corp services. All deployments follow the Trident release pipeline. + +## Deployment Commands + +### Deploy to staging +```bash +trident push --env staging --service --tag +``` + +### Promote to production +```bash +trident promote --from staging --to prod --service --approval-ticket +``` +Production deploys require a JIRA approval ticket (prefix: DEPLOY-). + +### Rollback +```bash +trident rollback --env --service --to-version +``` +Rollbacks auto-notify #ops-alerts in Slack. 
+ +### Check deployment status +```bash +trident status --env --service +``` + +## Service Registry + +| Service | Owner Team | Staging URL | Prod URL | +|---------|-----------|-------------|----------| +| payments-api | Platform | payments.staging.acme.internal | payments.acme.internal | +| user-service | Identity | users.staging.acme.internal | users.acme.internal | +| notifications | Engagement | notify.staging.acme.internal | notify.acme.internal | + +## Rules + +- All prod deploys require a DEPLOY- JIRA ticket +- Staging deploys are auto-approved during business hours (9am-5pm PT) +- Rollbacks bypass approval but require post-mortem within 48h +- Deploy freezes are announced in #engineering-announcements diff --git a/examples/features/tool-calls-template/workspace/.github/skills/acme-deploy/SKILL.md b/examples/features/tool-calls-template/workspace/.github/skills/acme-deploy/SKILL.md new file mode 100644 index 00000000..7ab31f68 --- /dev/null +++ b/examples/features/tool-calls-template/workspace/.github/skills/acme-deploy/SKILL.md @@ -0,0 +1,49 @@ +--- +name: acme-deploy +description: Use when the user asks about deploying services, checking deployment status, rollback procedures, or release management at Acme Corp +--- + +# Acme Corp Deployment Procedures + +## Overview + +Internal deployment runbook for Acme Corp services. All deployments follow the Trident release pipeline. + +## Deployment Commands + +### Deploy to staging +```bash +trident push --env staging --service --tag +``` + +### Promote to production +```bash +trident promote --from staging --to prod --service --approval-ticket +``` +Production deploys require a JIRA approval ticket (prefix: DEPLOY-). + +### Rollback +```bash +trident rollback --env --service --to-version +``` +Rollbacks auto-notify #ops-alerts in Slack. 
+ +### Check deployment status +```bash +trident status --env --service +``` + +## Service Registry + +| Service | Owner Team | Staging URL | Prod URL | +|---------|-----------|-------------|----------| +| payments-api | Platform | payments.staging.acme.internal | payments.acme.internal | +| user-service | Identity | users.staging.acme.internal | users.acme.internal | +| notifications | Engagement | notify.staging.acme.internal | notify.acme.internal | + +## Rules + +- All prod deploys require a DEPLOY- JIRA ticket +- Staging deploys are auto-approved during business hours (9am-5pm PT) +- Rollbacks bypass approval but require post-mortem within 48h +- Deploy freezes are announced in #engineering-announcements From 9859d586e2ef127e5c3cc6ee381c2a080bf70a30 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 16 Apr 2026 05:03:18 +0000 Subject: [PATCH 6/7] refactor(examples): single .agents skill + before_all hook + rubric assertions - Keep only .agents/skills/acme-deploy/SKILL.md as single source of truth - Add before_all hook to copy skills to .claude/skills/ in workspace - Switch from llm-grader with custom prompts to rubric assertions - Remove prompts/ directory and mock-agent.ts - Remove mock_agent target from root targets.yaml - Verified 3/3 pass with copilot at 100% Co-Authored-By: Claude Opus 4.6 --- .../tool-calls-template/evals/eval.yaml | 40 ++++++++------- .../prompts/no-skill-check.md | 12 ----- .../prompts/skill-usage-check.md | 12 ----- .../.claude/skills/acme-deploy/SKILL.md | 49 ------------------- .../.copilot/skills/acme-deploy/SKILL.md | 49 ------------------- .../.github/skills/acme-deploy/SKILL.md | 49 ------------------- 6 files changed, 21 insertions(+), 190 deletions(-) delete mode 100644 examples/features/tool-calls-template/prompts/no-skill-check.md delete mode 100644 examples/features/tool-calls-template/prompts/skill-usage-check.md delete mode 100644 examples/features/tool-calls-template/workspace/.claude/skills/acme-deploy/SKILL.md 
delete mode 100644 examples/features/tool-calls-template/workspace/.copilot/skills/acme-deploy/SKILL.md delete mode 100644 examples/features/tool-calls-template/workspace/.github/skills/acme-deploy/SKILL.md diff --git a/examples/features/tool-calls-template/evals/eval.yaml b/examples/features/tool-calls-template/evals/eval.yaml index 030868ff..ca8578bf 100644 --- a/examples/features/tool-calls-template/evals/eval.yaml +++ b/examples/features/tool-calls-template/evals/eval.yaml @@ -1,43 +1,45 @@ # Tool Calls Template Variable Demo # -# Demonstrates using {{ tool_calls }} in LLM grader prompts to check -# whether an agent invoked the right skills during execution — without -# needing the skill-trigger evaluator. +# Demonstrates using {{ tool_calls }} with rubric assertions to check +# whether an agent invoked the right skills — without needing the +# skill-trigger evaluator. # -# The workspace template includes the acme-deploy skill in all provider -# directories (.claude/, .agents/, .copilot/, .github/) so any agent -# target can discover and use it. +# Skills live in workspace/.agents/skills/. The before_all hook copies +# them to .claude/skills/ so copilot and other providers can discover them. # # Run: # bun agentv eval examples/features/tool-calls-template/evals/eval.yaml --target copilot name: tool-calls-template -description: LLM grader with {{ tool_calls }} template variable for skill verification +description: Rubric assertions with {{ tool_calls }} for skill verification workspace: template: ../workspace/ - -execution: - workers: 1 + hooks: + before_all: + command: + - bash + - -c + - 'WS=$(python3 -c "import json,sys;print(json.load(sys.stdin)[\"workspace_path\"])") && mkdir -p "$WS/.claude" && cp -r "$WS/.agents/skills" "$WS/.claude/skills"' tests: - id: deploy-skill-triggered input: How do I deploy payments-api to production? 
assertions: - - name: skill-invoked - type: llm-grader - prompt: file://../prompts/skill-usage-check.md + - type: rubrics + criteria: + - The agent invoked the acme-deploy skill - id: rollback-skill-triggered input: I need to roll back user-service in staging, what's the procedure? assertions: - - name: skill-invoked - type: llm-grader - prompt: file://../prompts/skill-usage-check.md + - type: rubrics + criteria: + - The agent invoked the acme-deploy skill - id: no-skill-for-unrelated input: Write a Python function that parses JSON logs and extracts error messages. assertions: - - name: no-skill-used - type: llm-grader - prompt: file://../prompts/no-skill-check.md + - type: rubrics + criteria: + - The tool_calls section does not contain any entry starting with "Skill:" (file creation, Read, Edit, and Bash are fine) diff --git a/examples/features/tool-calls-template/prompts/no-skill-check.md b/examples/features/tool-calls-template/prompts/no-skill-check.md deleted file mode 100644 index e79d40df..00000000 --- a/examples/features/tool-calls-template/prompts/no-skill-check.md +++ /dev/null @@ -1,12 +0,0 @@ -You are evaluating whether an AI agent completed a task WITHOUT using a skill. - -[[ ## question ## ]] -{{ input }} - -[[ ## answer ## ]] -{{ output }} - -[[ ## tool_calls ## ]] -{{ tool_calls }} - -Based on the tool calls above, determine whether the agent completed the task using only basic tools (Read, Edit, Bash, etc.) without invoking any Skill tool. Score 1.0 if no Skill tool was used, 0.0 if a Skill tool was invoked. diff --git a/examples/features/tool-calls-template/prompts/skill-usage-check.md b/examples/features/tool-calls-template/prompts/skill-usage-check.md deleted file mode 100644 index 828d21b4..00000000 --- a/examples/features/tool-calls-template/prompts/skill-usage-check.md +++ /dev/null @@ -1,12 +0,0 @@ -You are evaluating whether an AI agent invoked the expected skill. 
- -[[ ## question ## ]] -{{ input }} - -[[ ## answer ## ]] -{{ output }} - -[[ ## tool_calls ## ]] -{{ tool_calls }} - -Based on the tool calls above, determine whether the agent invoked the relevant skill to answer the question. Score 1.0 if a Skill tool was triggered, 0.0 if not. diff --git a/examples/features/tool-calls-template/workspace/.claude/skills/acme-deploy/SKILL.md b/examples/features/tool-calls-template/workspace/.claude/skills/acme-deploy/SKILL.md deleted file mode 100644 index 7ab31f68..00000000 --- a/examples/features/tool-calls-template/workspace/.claude/skills/acme-deploy/SKILL.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -name: acme-deploy -description: Use when the user asks about deploying services, checking deployment status, rollback procedures, or release management at Acme Corp ---- - -# Acme Corp Deployment Procedures - -## Overview - -Internal deployment runbook for Acme Corp services. All deployments follow the Trident release pipeline. - -## Deployment Commands - -### Deploy to staging -```bash -trident push --env staging --service --tag -``` - -### Promote to production -```bash -trident promote --from staging --to prod --service --approval-ticket -``` -Production deploys require a JIRA approval ticket (prefix: DEPLOY-). - -### Rollback -```bash -trident rollback --env --service --to-version -``` -Rollbacks auto-notify #ops-alerts in Slack. 
- -### Check deployment status -```bash -trident status --env --service -``` - -## Service Registry - -| Service | Owner Team | Staging URL | Prod URL | -|---------|-----------|-------------|----------| -| payments-api | Platform | payments.staging.acme.internal | payments.acme.internal | -| user-service | Identity | users.staging.acme.internal | users.acme.internal | -| notifications | Engagement | notify.staging.acme.internal | notify.acme.internal | - -## Rules - -- All prod deploys require a DEPLOY- JIRA ticket -- Staging deploys are auto-approved during business hours (9am-5pm PT) -- Rollbacks bypass approval but require post-mortem within 48h -- Deploy freezes are announced in #engineering-announcements diff --git a/examples/features/tool-calls-template/workspace/.copilot/skills/acme-deploy/SKILL.md b/examples/features/tool-calls-template/workspace/.copilot/skills/acme-deploy/SKILL.md deleted file mode 100644 index 7ab31f68..00000000 --- a/examples/features/tool-calls-template/workspace/.copilot/skills/acme-deploy/SKILL.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -name: acme-deploy -description: Use when the user asks about deploying services, checking deployment status, rollback procedures, or release management at Acme Corp ---- - -# Acme Corp Deployment Procedures - -## Overview - -Internal deployment runbook for Acme Corp services. All deployments follow the Trident release pipeline. - -## Deployment Commands - -### Deploy to staging -```bash -trident push --env staging --service --tag -``` - -### Promote to production -```bash -trident promote --from staging --to prod --service --approval-ticket -``` -Production deploys require a JIRA approval ticket (prefix: DEPLOY-). - -### Rollback -```bash -trident rollback --env --service --to-version -``` -Rollbacks auto-notify #ops-alerts in Slack. 
- -### Check deployment status -```bash -trident status --env --service -``` - -## Service Registry - -| Service | Owner Team | Staging URL | Prod URL | -|---------|-----------|-------------|----------| -| payments-api | Platform | payments.staging.acme.internal | payments.acme.internal | -| user-service | Identity | users.staging.acme.internal | users.acme.internal | -| notifications | Engagement | notify.staging.acme.internal | notify.acme.internal | - -## Rules - -- All prod deploys require a DEPLOY- JIRA ticket -- Staging deploys are auto-approved during business hours (9am-5pm PT) -- Rollbacks bypass approval but require post-mortem within 48h -- Deploy freezes are announced in #engineering-announcements diff --git a/examples/features/tool-calls-template/workspace/.github/skills/acme-deploy/SKILL.md b/examples/features/tool-calls-template/workspace/.github/skills/acme-deploy/SKILL.md deleted file mode 100644 index 7ab31f68..00000000 --- a/examples/features/tool-calls-template/workspace/.github/skills/acme-deploy/SKILL.md +++ /dev/null @@ -1,49 +0,0 @@ ---- -name: acme-deploy -description: Use when the user asks about deploying services, checking deployment status, rollback procedures, or release management at Acme Corp ---- - -# Acme Corp Deployment Procedures - -## Overview - -Internal deployment runbook for Acme Corp services. All deployments follow the Trident release pipeline. - -## Deployment Commands - -### Deploy to staging -```bash -trident push --env staging --service --tag -``` - -### Promote to production -```bash -trident promote --from staging --to prod --service --approval-ticket -``` -Production deploys require a JIRA approval ticket (prefix: DEPLOY-). - -### Rollback -```bash -trident rollback --env --service --to-version -``` -Rollbacks auto-notify #ops-alerts in Slack. 
- -### Check deployment status -```bash -trident status --env --service -``` - -## Service Registry - -| Service | Owner Team | Staging URL | Prod URL | -|---------|-----------|-------------|----------| -| payments-api | Platform | payments.staging.acme.internal | payments.acme.internal | -| user-service | Identity | users.staging.acme.internal | users.acme.internal | -| notifications | Engagement | notify.staging.acme.internal | notify.acme.internal | - -## Rules - -- All prod deploys require a DEPLOY- JIRA ticket -- Staging deploys are auto-approved during business hours (9am-5pm PT) -- Rollbacks bypass approval but require post-mortem within 48h -- Deploy freezes are announced in #engineering-announcements From 8a201c39c73e49e979c9dbd56f41c8312eb6c753 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Thu, 16 Apr 2026 05:25:02 +0000 Subject: [PATCH 7/7] docs: add tool_calls context section to rubrics docs + flatten example assertions Add "Context Available to Rubric Graders" section to rubrics.mdx documenting that rubric assertions receive tool_calls and file_changes context. Flatten example eval assertions from `type: rubrics` with `criteria:` to plain string shorthand. Co-Authored-By: Claude Opus 4.6 --- .../content/docs/docs/evaluation/rubrics.mdx | 17 +++++++++++++++++ .../tool-calls-template/evals/eval.yaml | 12 +++--------- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx index 4f9f5587..14759e79 100644 --- a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx +++ b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx @@ -122,6 +122,23 @@ score = sum(criterion_score / 10 * weight) / sum(total_weights) Write rubric criteria directly in `assertions`. If you want help choosing between plain assertions, deterministic graders, and rubric or LLM-based grading, use the `agentv-eval-writer` skill. 
Keep the grader choice driven by the criteria rather than one fixed recipe. +## Context Available to Rubric Graders + +Rubric assertions automatically receive the full evaluation context, not just the agent's text answer. When present, the following are appended to the grader prompt: + +- **`file_changes`** — unified diff of workspace file changes (when `workspace` is configured) +- **`tool_calls`** — formatted summary of tool calls from agent execution (tool name + key inputs) + +This means rubric criteria can reason about *what the agent did*, not only what it said. For example, you can check whether an agent invoked a specific skill: + +```yaml +assertions: + - The agent invoked the acme-deploy skill + - The agent used Read to inspect the config file before editing +``` + +This is a lightweight alternative to the `skill-trigger` evaluator when you want to check tool usage with natural-language criteria. + ## Combining with Other Graders Rubrics work alongside code and LLM graders: diff --git a/examples/features/tool-calls-template/evals/eval.yaml b/examples/features/tool-calls-template/evals/eval.yaml index ca8578bf..18976db7 100644 --- a/examples/features/tool-calls-template/evals/eval.yaml +++ b/examples/features/tool-calls-template/evals/eval.yaml @@ -26,20 +26,14 @@ tests: - id: deploy-skill-triggered input: How do I deploy payments-api to production? assertions: - - type: rubrics - criteria: - - The agent invoked the acme-deploy skill + - The agent invoked the acme-deploy skill - id: rollback-skill-triggered input: I need to roll back user-service in staging, what's the procedure? assertions: - - type: rubrics - criteria: - - The agent invoked the acme-deploy skill + - The agent invoked the acme-deploy skill - id: no-skill-for-unrelated input: Write a Python function that parses JSON logs and extracts error messages. 
assertions: - - type: rubrics - criteria: - - The tool_calls section does not contain any entry starting with "Skill:" (file creation, Read, Edit, and Bash are fine) + - The tool_calls section does not contain any entry starting with "Skill:" (file creation, Read, Edit, and Bash are fine)