diff --git a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx index 4f9f5587e..14759e797 100644 --- a/apps/web/src/content/docs/docs/evaluation/rubrics.mdx +++ b/apps/web/src/content/docs/docs/evaluation/rubrics.mdx @@ -122,6 +122,23 @@ score = sum(criterion_score / 10 * weight) / sum(total_weights) Write rubric criteria directly in `assertions`. If you want help choosing between plain assertions, deterministic graders, and rubric or LLM-based grading, use the `agentv-eval-writer` skill. Keep the grader choice driven by the criteria rather than one fixed recipe. +## Context Available to Rubric Graders + +Rubric assertions automatically receive the full evaluation context, not just the agent's text answer. When present, the following are appended to the grader prompt: + +- **`file_changes`** — unified diff of workspace file changes (when `workspace` is configured) +- **`tool_calls`** — formatted summary of tool calls from agent execution (tool name + key inputs) + +This means rubric criteria can reason about *what the agent did*, not only what it said. For example, you can check whether an agent invoked a specific skill: + +```yaml +assertions: + - The agent invoked the acme-deploy skill + - The agent used Read to inspect the config file before editing +``` + +This is a lightweight alternative to the `skill-trigger` evaluator when you want to check tool usage with natural-language criteria. 
+ ## Combining with Other Graders Rubrics work alongside code and LLM graders: diff --git a/apps/web/src/content/docs/docs/graders/llm-graders.mdx b/apps/web/src/content/docs/docs/graders/llm-graders.mdx index 3f9cd969c..14f88ba6a 100644 --- a/apps/web/src/content/docs/docs/graders/llm-graders.mdx +++ b/apps/web/src/content/docs/docs/graders/llm-graders.mdx @@ -73,6 +73,7 @@ Score the response from 0.0 to 1.0 based on: | `expected_output` | Full resolved expected array, JSON-serialized | | `output` | Full provider output array, JSON-serialized | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | +| `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | ## Per-Grader Target @@ -228,6 +229,7 @@ Derived strings injected into grader prompts: | `expected_output` | Full resolved expected array, JSON-serialized | | `output` | Full provider output array, JSON-serialized | | `file_changes` | Unified diff of workspace file changes (populated when `workspace` is configured) | +| `tool_calls` | Formatted summary of tool calls from agent execution (tool name + key inputs per call) | **Example flow:** diff --git a/examples/features/tool-calls-template/evals/eval.yaml b/examples/features/tool-calls-template/evals/eval.yaml new file mode 100644 index 000000000..18976db70 --- /dev/null +++ b/examples/features/tool-calls-template/evals/eval.yaml @@ -0,0 +1,39 @@ +# Tool Calls Template Variable Demo +# +# Demonstrates using {{ tool_calls }} with rubric assertions to check +# whether an agent invoked the right skills — without needing the +# skill-trigger evaluator. +# +# Skills live in workspace/.agents/skills/. The before_all hook copies +# them to .claude/skills/ so copilot and other providers can discover them. 
+# +# Run: +# bun agentv eval examples/features/tool-calls-template/evals/eval.yaml --target copilot + +name: tool-calls-template +description: Rubric assertions with {{ tool_calls }} for skill verification + +workspace: + template: ../workspace/ + hooks: + before_all: + command: + - bash + - -c + - 'WS=$(python3 -c "import json,sys;print(json.load(sys.stdin)[\"workspace_path\"])") && mkdir -p "$WS/.claude" && cp -r "$WS/.agents/skills" "$WS/.claude/skills"' + +tests: + - id: deploy-skill-triggered + input: How do I deploy payments-api to production? + assertions: + - The agent invoked the acme-deploy skill + + - id: rollback-skill-triggered + input: I need to roll back user-service in staging, what's the procedure? + assertions: + - The agent invoked the acme-deploy skill + + - id: no-skill-for-unrelated + input: Write a Python function that parses JSON logs and extracts error messages. + assertions: + - The tool_calls section does not contain any entry starting with "Skill:" (file creation, Read, Edit, and Bash are fine) diff --git a/examples/features/tool-calls-template/workspace/.agents/skills/acme-deploy/SKILL.md b/examples/features/tool-calls-template/workspace/.agents/skills/acme-deploy/SKILL.md new file mode 100644 index 000000000..7ab31f686 --- /dev/null +++ b/examples/features/tool-calls-template/workspace/.agents/skills/acme-deploy/SKILL.md @@ -0,0 +1,49 @@ +--- +name: acme-deploy +description: Use when the user asks about deploying services, checking deployment status, rollback procedures, or release management at Acme Corp +--- + +# Acme Corp Deployment Procedures + +## Overview + +Internal deployment runbook for Acme Corp services. All deployments follow the Trident release pipeline. 
+ +## Deployment Commands + +### Deploy to staging +```bash +trident push --env staging --service <service> --tag <version> +``` + +### Promote to production +```bash +trident promote --from staging --to prod --service <service> --approval-ticket <ticket> +``` +Production deploys require a JIRA approval ticket (prefix: DEPLOY-). + +### Rollback +```bash +trident rollback --env <env> --service <service> --to-version <version> +``` +Rollbacks auto-notify #ops-alerts in Slack. + +### Check deployment status +```bash +trident status --env <env> --service <service> +``` + +## Service Registry + +| Service | Owner Team | Staging URL | Prod URL | +|---------|-----------|-------------|----------| +| payments-api | Platform | payments.staging.acme.internal | payments.acme.internal | +| user-service | Identity | users.staging.acme.internal | users.acme.internal | +| notifications | Engagement | notify.staging.acme.internal | notify.acme.internal | + +## Rules + +- All prod deploys require a DEPLOY- JIRA ticket +- Staging deploys are auto-approved during business hours (9am-5pm PT) +- Rollbacks bypass approval but require post-mortem within 48h +- Deploy freezes are announced in #engineering-announcements diff --git a/packages/core/src/evaluation/graders/format-tool-calls.ts b/packages/core/src/evaluation/graders/format-tool-calls.ts new file mode 100644 index 000000000..148df281d --- /dev/null +++ b/packages/core/src/evaluation/graders/format-tool-calls.ts @@ -0,0 +1,79 @@ +/** + * Formats tool calls from agent output messages into a human-readable summary. + * + * Used by `{{ tool_calls }}` template variable in LLM grader prompts. + * Extracts key input fields per tool to keep the summary compact: + * - Skill: `skill` arg + * - Read/Write/Edit: `file_path` + * - Bash: `command` + * - Grep/Glob: `pattern` + * - Other tools: first string-valued input field (if any) + * + * Returns empty string when there are no tool calls (template variable resolves to ''). 
+ */ + +import type { Message } from '../providers/types.js'; + +/** + * Key input fields to extract per tool name. + * Order matters — first matching field wins. + */ +const KEY_INPUT_FIELDS: ReadonlyMap<string, readonly string[]> = new Map([ + ['Skill', ['skill']], + ['Read', ['file_path']], + ['Write', ['file_path']], + ['Edit', ['file_path']], + ['Bash', ['command']], + ['Grep', ['pattern']], + ['Glob', ['pattern']], +]); + +/** Fallback: pick the first short string-valued field from input. */ +const MAX_FALLBACK_LENGTH = 120; + +export function formatToolCalls(output: readonly Message[] | undefined): string { + if (!output) return ''; + + const lines: string[] = []; + + for (const message of output) { + if (!message.toolCalls) continue; + for (const call of message.toolCalls) { + const toolName = call.tool ?? 'unknown'; + const detail = extractKeyDetail(toolName, call.input); + lines.push(detail ? `- ${toolName}: ${detail}` : `- ${toolName}`); + } + } + + return lines.length > 0 ? lines.join('\n') : ''; +} + +function extractKeyDetail(toolName: string, input: unknown): string { + if (!input || typeof input !== 'object') return ''; + const record = input as Record<string, unknown>; + + // Try known key fields for this tool + const knownFields = KEY_INPUT_FIELDS.get(toolName); + if (knownFields) { + for (const field of knownFields) { + const value = record[field]; + if (typeof value === 'string' && value.length > 0) { + return truncate(value); + } + } + } + + // Fallback: first short string-valued field + for (const value of Object.values(record)) { + if (typeof value === 'string' && value.length > 0 && value.length <= MAX_FALLBACK_LENGTH) { + return truncate(value); + } + } + + return ''; +} + +function truncate(value: string, maxLen = 120): string { + if (value.length <= maxLen) return value; + return `${value.slice(0, maxLen)}…`; +} diff --git a/packages/core/src/evaluation/graders/index.ts b/packages/core/src/evaluation/graders/index.ts index 107582aee..c87ff4a59 100.644 ---
a/packages/core/src/evaluation/graders/index.ts +++ b/packages/core/src/evaluation/graders/index.ts @@ -55,6 +55,8 @@ export { } from './llm-grader.js'; export type { LlmGraderOptions } from './llm-grader.js'; +export { formatToolCalls } from './format-tool-calls.js'; + export { SkillTriggerGrader } from './skill-trigger.js'; export { assembleLlmGraderPrompt } from './llm-grader-prompt.js'; diff --git a/packages/core/src/evaluation/graders/llm-grader-prompt.ts b/packages/core/src/evaluation/graders/llm-grader-prompt.ts index 1cc7774bb..fc50aae4a 100644 --- a/packages/core/src/evaluation/graders/llm-grader-prompt.ts +++ b/packages/core/src/evaluation/graders/llm-grader-prompt.ts @@ -24,6 +24,7 @@ export function assembleLlmGraderPrompt(input: { evaluatorConfig?: LlmGraderConfig; output?: readonly Message[]; fileChanges?: string; + toolCalls?: string; graderTemplateOverride?: string; }): LlmGraderPromptAssembly { const { @@ -32,6 +33,7 @@ export function assembleLlmGraderPrompt(input: { promptInputs, evaluatorConfig, fileChanges, + toolCalls, graderTemplateOverride, } = input; @@ -41,12 +43,19 @@ export function assembleLlmGraderPrompt(input: { if (rubrics && rubrics.length > 0) { const hasScoreRanges = rubrics.some((r) => r.score_ranges && r.score_ranges.length > 0); if (hasScoreRanges) { - return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges); + return assembleScoreRange(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls); } - return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges); + return assembleChecklist(evalCase, candidate, promptInputs, rubrics, fileChanges, toolCalls); } - return assembleFreeform(evalCase, candidate, promptInputs, fileChanges, graderTemplateOverride); + return assembleFreeform( + evalCase, + candidate, + promptInputs, + fileChanges, + toolCalls, + graderTemplateOverride, + ); } function assembleFreeform( @@ -54,6 +63,7 @@ function assembleFreeform( candidate: string, 
promptInputs: PromptInputs, fileChanges?: string, + toolCalls?: string, graderTemplateOverride?: string, ): LlmGraderPromptAssembly { const formattedQuestion = @@ -67,6 +77,7 @@ function assembleFreeform( [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.CRITERIA]: evalCase.criteria.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: toolCalls ?? '', // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: candidate.trim(), @@ -77,10 +88,13 @@ function assembleFreeform( const template = graderTemplateOverride ?? DEFAULT_GRADER_TEMPLATE; let userPrompt = substituteVariables(template, variables); - // Append file_changes section to default template only when present + // Append file_changes and tool_calls sections to default template only when present if (fileChanges && !graderTemplateOverride) { userPrompt += `\n\n[[ ## file_changes ## ]]\n${fileChanges}`; } + if (toolCalls && !graderTemplateOverride) { + userPrompt += `\n\n[[ ## tool_calls ## ]]\n${toolCalls}`; + } return { systemPrompt, @@ -96,6 +110,7 @@ function assembleChecklist( promptInputs: PromptInputs, rubrics: readonly RubricItem[], fileChanges?: string, + toolCalls?: string, ): LlmGraderPromptAssembly { const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 @@ -123,6 +138,10 @@ function assembleChecklist( parts.push('[[ ## file_changes ## ]]', fileChanges, ''); } + if (toolCalls) { + parts.push('[[ ## tool_calls ## ]]', toolCalls, ''); + } + parts.push('[[ ## rubrics ## ]]'); for (const rubric of rubrics) { @@ -150,6 +169,7 @@ function assembleScoreRange( promptInputs: PromptInputs, rubrics: readonly RubricItem[], fileChanges?: string, + toolCalls?: string, ): LlmGraderPromptAssembly { const formattedQuestion = promptInputs.question && promptInputs.question.trim().length > 0 @@ -178,6 +198,10 @@ function 
assembleScoreRange( parts.push('[[ ## file_changes ## ]]', fileChanges, ''); } + if (toolCalls) { + parts.push('[[ ## tool_calls ## ]]', toolCalls, ''); + } + parts.push('[[ ## scoring_criteria ## ]]'); for (const rubric of rubrics) { diff --git a/packages/core/src/evaluation/graders/llm-grader.ts b/packages/core/src/evaluation/graders/llm-grader.ts index 47812ef47..15e41ab99 100644 --- a/packages/core/src/evaluation/graders/llm-grader.ts +++ b/packages/core/src/evaluation/graders/llm-grader.ts @@ -272,6 +272,7 @@ export class LlmGrader implements Grader { [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', // Deprecated aliases — same values as the primary variables above [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), @@ -290,10 +291,13 @@ export class LlmGrader implements Grader { let userPrompt = substituteVariables(graderTemplate, variables); - // Append file_changes section to default template only when present + // Append file_changes and tool_calls sections to default template only when present if (context.fileChanges && !context.graderTemplateOverride && !this.graderTemplate) { userPrompt += `\n\n[[ ## file_changes ## ]]\n${context.fileChanges}`; } + if (context.toolCalls && !context.graderTemplateOverride && !this.graderTemplate) { + userPrompt += `\n\n[[ ## tool_calls ## ]]\n${context.toolCalls}`; + } const graderRawRequest: JsonObject = { userPrompt, @@ -691,6 +695,7 @@ export class LlmGrader implements Grader { [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? 
'', + [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), @@ -726,6 +731,10 @@ export class LlmGrader implements Grader { parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); } + if (context.toolCalls) { + parts.push('[[ ## tool_calls ## ]]', context.toolCalls, ''); + } + if (rubrics && rubrics.length > 0) { parts.push('[[ ## rubrics ## ]]'); for (const rubric of rubrics) { @@ -766,6 +775,7 @@ export class LlmGrader implements Grader { [TEMPLATE_VARIABLES.OUTPUT]: context.candidate.trim(), [TEMPLATE_VARIABLES.EXPECTED_OUTPUT]: (context.evalCase.reference_answer ?? '').trim(), [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + [TEMPLATE_VARIABLES.TOOL_CALLS]: context.toolCalls ?? '', // Deprecated aliases [TEMPLATE_VARIABLES.INPUT_TEXT]: formattedQuestion.trim(), [TEMPLATE_VARIABLES.OUTPUT_TEXT]: context.candidate.trim(), @@ -801,6 +811,10 @@ export class LlmGrader implements Grader { parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); } + if (context.toolCalls) { + parts.push('[[ ## tool_calls ## ]]', context.toolCalls, ''); + } + if (rubrics && rubrics.length > 0) { parts.push('[[ ## rubrics ## ]]'); for (const rubric of rubrics) { @@ -923,6 +937,10 @@ export class LlmGrader implements Grader { parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); } + if (context.toolCalls) { + parts.push('[[ ## tool_calls ## ]]', context.toolCalls, ''); + } + parts.push('[[ ## scoring_criteria ## ]]'); for (const rubric of rubrics) { @@ -985,6 +1003,10 @@ export class LlmGrader implements Grader { parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); } + if (context.toolCalls) { + parts.push('[[ ## tool_calls ## ]]', context.toolCalls, ''); + } + parts.push('[[ ## rubrics ## ]]'); for (const rubric of rubrics) { diff --git a/packages/core/src/evaluation/graders/types.ts 
b/packages/core/src/evaluation/graders/types.ts index 14e4fd445..1d548e5f9 100644 --- a/packages/core/src/evaluation/graders/types.ts +++ b/packages/core/src/evaluation/graders/types.ts @@ -55,6 +55,8 @@ export interface EvaluationContext { readonly availableTargets?: readonly string[]; /** Unified diff of file changes from workspace */ readonly fileChanges?: string; + /** Formatted summary of tool calls from agent execution */ + readonly toolCalls?: string; /** Absolute path to the workspace directory */ readonly workspacePath?: string; /** Docker workspace config: when present, code-grader commands run inside a container */ diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 6a1a3a0c4..1e4b24e12 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -15,6 +15,7 @@ import { type EvaluationScore, type Grader, LlmGrader, + formatToolCalls, negateScore, scoreToVerdict, } from './graders.js'; @@ -2271,6 +2272,9 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise Provider | undefined; readonly availableTargets?: readonly string[]; readonly fileChanges?: string; + readonly toolCalls?: string; readonly workspacePath?: string; readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig; readonly verbose?: boolean; @@ -2659,6 +2665,7 @@ async function evaluateCandidate(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, threshold: evalThreshold, @@ -2688,6 +2695,7 @@ async function evaluateCandidate(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, threshold: evalThreshold, @@ -2775,6 +2783,7 @@ async function runEvaluatorsForCase(options: { readonly targetResolver?: (name: string) => Provider | undefined; readonly availableTargets?: readonly string[]; readonly fileChanges?: string; + readonly toolCalls?: string; readonly 
workspacePath?: string; readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig; readonly threshold?: number; @@ -2802,6 +2811,7 @@ async function runEvaluatorsForCase(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, threshold, @@ -2832,6 +2842,7 @@ async function runEvaluatorsForCase(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, threshold, @@ -2868,6 +2879,7 @@ async function runEvaluatorsForCase(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, dependencyResults, @@ -2914,6 +2926,7 @@ async function runEvaluatorList(options: { readonly targetResolver?: (name: string) => Provider | undefined; readonly availableTargets?: readonly string[]; readonly fileChanges?: string; + readonly toolCalls?: string; readonly workspacePath?: string; readonly dockerConfig?: import('./types.js').DockerWorkspaceConfig; readonly threshold?: number; @@ -2942,6 +2955,7 @@ async function runEvaluatorList(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, dependencyResults, @@ -2977,6 +2991,7 @@ async function runEvaluatorList(options: { targetResolver, availableTargets, fileChanges, + toolCalls, workspacePath, dockerConfig, dependencyResults, diff --git a/packages/core/src/evaluation/template-variables.ts b/packages/core/src/evaluation/template-variables.ts index 508d837db..9d92f0d87 100644 --- a/packages/core/src/evaluation/template-variables.ts +++ b/packages/core/src/evaluation/template-variables.ts @@ -8,6 +8,7 @@ * - {{ expected_output }} — reference answer as plain text * - {{ criteria }} — evaluation criteria string * - {{ file_changes }} — file diff (if available) + * - {{ tool_calls }} — formatted summary of tool calls from agent execution * * Deprecated aliases (emit a warning when used in custom templates): * - {{ input_text }} → use {{ input }} @@ -20,6 +21,7 @@ 
export const TEMPLATE_VARIABLES = { INPUT: 'input', OUTPUT: 'output', FILE_CHANGES: 'file_changes', + TOOL_CALLS: 'tool_calls', /** @deprecated Use INPUT instead — resolves to the same text value. */ INPUT_TEXT: 'input_text', /** @deprecated Use OUTPUT instead — resolves to the same text value. */ diff --git a/packages/core/test/evaluation/graders/format-tool-calls.test.ts b/packages/core/test/evaluation/graders/format-tool-calls.test.ts new file mode 100644 index 000000000..6ce1122c5 --- /dev/null +++ b/packages/core/test/evaluation/graders/format-tool-calls.test.ts @@ -0,0 +1,132 @@ +import { describe, expect, it } from 'vitest'; +import { formatToolCalls } from '../../../src/evaluation/graders/format-tool-calls.js'; +import type { Message } from '../../../src/evaluation/providers/types.js'; + +describe('formatToolCalls', () => { + it('returns empty string for undefined output', () => { + expect(formatToolCalls(undefined)).toBe(''); + }); + + it('returns empty string for empty messages array', () => { + expect(formatToolCalls([])).toBe(''); + }); + + it('returns empty string when no messages have tool calls', () => { + const messages: Message[] = [ + { role: 'assistant', content: 'Hello' }, + { role: 'user', content: 'Hi' }, + ]; + expect(formatToolCalls(messages)).toBe(''); + }); + + it('formats Skill tool calls with skill name', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'Skill', input: { skill: 'commit' } }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- Skill: commit'); + }); + + it('formats Read/Write/Edit tool calls with file_path', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'Read', input: { file_path: '/src/index.ts' } }, + { tool: 'Write', input: { file_path: '/src/output.ts', content: '...' 
} }, + { tool: 'Edit', input: { file_path: '/src/edit.ts', old_string: 'a', new_string: 'b' } }, + ], + }, + ]; + const result = formatToolCalls(messages); + expect(result).toBe('- Read: /src/index.ts\n- Write: /src/output.ts\n- Edit: /src/edit.ts'); + }); + + it('formats Bash tool calls with command', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'Bash', input: { command: 'npm test' } }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- Bash: npm test'); + }); + + it('formats Grep/Glob tool calls with pattern', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'Grep', input: { pattern: 'TODO', path: '/src' } }, + { tool: 'Glob', input: { pattern: '**/*.ts' } }, + ], + }, + ]; + expect(formatToolCalls(messages)).toBe('- Grep: TODO\n- Glob: **/*.ts'); + }); + + it('formats mixed tool calls across multiple messages', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [ + { tool: 'Read', input: { file_path: '/package.json' } }, + { tool: 'Bash', input: { command: 'ls -la' } }, + ], + }, + { role: 'user', content: 'ok' }, + { + role: 'assistant', + toolCalls: [{ tool: 'Skill', input: { skill: 'review-pr' } }], + }, + ]; + const result = formatToolCalls(messages); + expect(result).toBe('- Read: /package.json\n- Bash: ls -la\n- Skill: review-pr'); + }); + + it('falls back to first short string field for unknown tools', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'CustomTool', input: { query: 'find me something' } }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- CustomTool: find me something'); + }); + + it('shows tool name only when input is empty', () => { + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'SomeTool', input: {} }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- SomeTool'); + }); + + it('shows tool name only when input is undefined', () => { + const 
messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'SomeTool' }], + }, + ]; + expect(formatToolCalls(messages)).toBe('- SomeTool'); + }); + + it('truncates long input values', () => { + const longCommand = 'x'.repeat(200); + const messages: Message[] = [ + { + role: 'assistant', + toolCalls: [{ tool: 'Bash', input: { command: longCommand } }], + }, + ]; + const result = formatToolCalls(messages); + expect(result).toContain('- Bash: '); + // 120 chars + ellipsis + expect(result.length).toBeLessThan(200); + }); +});