From e57a33d44dfdca3c15e33a24911b1d72a32526b8 Mon Sep 17 00:00:00 2001 From: "James N." Date: Tue, 9 Jun 2026 19:25:57 -0700 Subject: [PATCH 1/2] fix(eval): make Foundry --remote push work with azure-ai-projects 2.1.0 get_openai_client(default_query={'api-version': ...}) breaks on azure-ai-projects >= 2.1.0: the new unified /v1 Foundry path rejects the legacy api-version query parameter (400: 'api-version query parameter is not allowed when using /v1 path'), so every --remote run failed to push results to Azure AI Foundry. Call get_openai_client() without forcing api-version, falling back to an explicit api_version kwarg only for older SDKs that require it. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- agentic_ai/evaluations/run_agent_eval.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/agentic_ai/evaluations/run_agent_eval.py b/agentic_ai/evaluations/run_agent_eval.py index 4b79cf0eb..147215951 100644 --- a/agentic_ai/evaluations/run_agent_eval.py +++ b/agentic_ai/evaluations/run_agent_eval.py @@ -275,12 +275,21 @@ async def run_foundry_evaluation(traces: List[AgentTrace], data_file: Path, agen ) with project_client: - # Get OpenAI client from the project - # Explicitly set api-version to ensure azure_ai_evaluator support - # (the SDK default should be 2025-11-15-preview but we force it to be safe) - openai_client = project_client.get_openai_client( - default_query={"api-version": "2025-11-15-preview"} - ) + # Get the OpenAI-compatible client from the project. + # + # azure-ai-projects >= 2.1.0 routes get_openai_client() at the new + # unified "/v1" path, which REJECTS the legacy "api-version" query + # parameter ("api-version query parameter is not allowed when using + # /v1 path"). Older pre-release SDKs needed api-version forced. So we + # try the modern (no api-version) call first and only fall back to + # forcing api-version if the installed SDK requires it. + try: + openai_client = project_client.get_openai_client() + except TypeError: + # Very old SDKs require an explicit api_version kwarg. + openai_client = project_client.get_openai_client( + api_version="2025-11-15-preview" + ) # Diagnostic logging for CI debugging import azure.ai.projects From d059f8404f029e456a515c3d8aba7e1ea4f707f4 Mon Sep 17 00:00:00 2001 From: "James N." Date: Tue, 9 Jun 2026 20:09:55 -0700 Subject: [PATCH 2/2] fix(eval): correct tool-call capture and task-adherence scoring Three related defects made tool_call_accuracy (~2.0) and task_adherence (~0.2) misleadingly low, independent of real agent quality. 1. Streaming tool-call capture (agent-framework >= 1.7): every function_call delta chunk now carries the tool name + a stable call_id, but the code assumed only the first chunk had a name. Each argument fragment was finalized as its own malformed tool call, turning one get_customer_detail({"customer_id":5}) into 6+ garbage calls with {"_raw": ...} args (and spamming duplicate tool_called UI events). track_function_call_start() now de-duplicates on call_id and only starts/broadcasts a genuinely new call. One real call = one clean call. 2. Tool results were never captured. function_result content carries the tool output (keyed by call_id) but was discarded. The mixin now records it via track_function_result(), so backend tools_used and the evaluator can see what each tool returned. 3. task_adherence judge could not verify grounding. metrics.py passed tool calls but no tool results, so grounded answers looked fabricated (judge: "no corroborating tool interactions"). It now emits role:tool result messages. Additionally, azure-ai-evaluation 1.14.0 made TaskAdherenceEvaluator a binary "flagged" grader (score in {0,1} plus a pass/fail result); the old code treated it as 1-5 and thresholded >= 3, so every PASS was recorded as a fail and shown as 1.0/5. Scoring now honors the pass/fail result and maps the binary verdict to a 1-5 display. Verified on a 5-case handoff run: tool calls captured cleanly (2-4 per query, 0 malformed); task_adherence avg 0.2 -> 3.4, with the remaining low scores being genuine hallucinations (agent citing data absent from tool output) rather than measurement artifacts. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../multi_agent/handoff_multi_domain_agent.py | 11 +++- .../multi_agent/reflection_agent.py | 29 ++++++--- .../agents/agent_framework/single_agent.py | 40 ++++++++---- agentic_ai/agents/base_agent.py | 64 +++++++++++++++++-- agentic_ai/evaluations/metrics.py | 46 ++++++++++--- 5 files changed, 149 insertions(+), 41 deletions(-) diff --git a/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py b/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py index 6a3cb1205..4f51d35a2 100644 --- a/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py +++ b/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py @@ -518,8 +518,10 @@ async def _handle_agent_update( # Real domain tool — track for the UI. Synthetic handoff # tools are filtered out because the framework already # surfaces those as ``handoff_sent`` events. - self.track_function_call_start(name) - if self._ws_manager: + is_new_call = self.track_function_call_start( + name, getattr(content, "call_id", None) + ) + if is_new_call and self._ws_manager: await self._ws_manager.broadcast( self.session_id, { @@ -533,7 +535,10 @@ async def _handle_agent_update( if args_chunk: self.track_function_call_arguments(args_chunk) elif ctype == "function_result": - self.finalize_tool_tracking() + self.track_function_result( + getattr(content, "call_id", None), + getattr(content, "result", None), + ) text = getattr(update, "text", None) if text: diff --git a/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py b/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py index fbb56f2a0..7641de1aa 100644 --- a/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py +++ b/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py @@ -181,14 +181,19 @@ async def _run_agent_non_streaming( for content in chunk.contents: if content.type == "function_call": if content.name: - self.track_function_call_start(content.name) + self.track_function_call_start( + content.name, getattr(content, "call_id", None) + ) args_chunk = getattr(content, 'arguments', '') if args_chunk: self.track_function_call_arguments(args_chunk) elif content.type == "function_result": - self.finalize_tool_tracking() + self.track_function_result( + getattr(content, "call_id", None), + getattr(content, "result", None), + ) # Collect text if hasattr(chunk, 'text') and chunk.text: @@ -223,20 +228,26 @@ async def _run_agent_streaming( for content in chunk.contents: if content.type == "function_call": if content.name: - self.track_function_call_start(content.name) + is_new_call = self.track_function_call_start( + content.name, getattr(content, "call_id", None) + ) - await self._broadcast_raw({ - "type": "tool_called", - "agent_id": agent_id, - "tool_name": content.name, - }) + if is_new_call: + await self._broadcast_raw({ + "type": "tool_called", + "agent_id": agent_id, + "tool_name": content.name, + }) args_chunk = getattr(content, 'arguments', '') if args_chunk: self.track_function_call_arguments(args_chunk) elif content.type == "function_result": - self.finalize_tool_tracking() + self.track_function_result( + getattr(content, "call_id", None), + getattr(content, "result", None), + ) # Stream text if hasattr(chunk, 'text') and chunk.text: diff --git a/agentic_ai/agents/agent_framework/single_agent.py b/agentic_ai/agents/agent_framework/single_agent.py index f66ae4fe1..1886f1d90 100644 --- a/agentic_ai/agents/agent_framework/single_agent.py +++ b/agentic_ai/agents/agent_framework/single_agent.py @@ -174,13 +174,16 @@ async def chat_async(self, prompt: str) -> str: if hasattr(chunk, 'contents') and chunk.contents: for content in chunk.contents: if content.type == "function_call": - # Function call chunks come in pieces: - # 1. First chunk has name, empty arguments - # 2. Subsequent chunks have no name, partial arguments + # Function call chunks come in pieces. Older SDKs sent + # the name only on the first chunk; agent-framework >= 1.7 + # repeats name + a stable call_id on every chunk, so we + # de-duplicate on call_id to avoid fragmenting one call + # into many malformed ones. if content.name: - # New function call starting - finalize previous if any - self.track_function_call_start(content.name) - + self.track_function_call_start( + content.name, getattr(content, "call_id", None) + ) + # Accumulate arguments args_chunk = getattr(content, 'arguments', '') if args_chunk: @@ -188,7 +191,10 @@ async def chat_async(self, prompt: str) -> str: elif content.type == "function_result": # Function result means the call is complete - self.finalize_tool_tracking() + self.track_function_result( + getattr(content, "call_id", None), + getattr(content, "result", None), + ) # Extract text if hasattr(chunk, 'text') and chunk.text: @@ -238,11 +244,14 @@ async def _chat_async_streaming(self, prompt: str) -> str: # Handle function calls - accumulate arguments across chunks if content.type == "function_call": if content.name: - # New function call - finalize previous and start new - self.track_function_call_start(content.name) - - # Broadcast that a tool is being called - if self._ws_manager: + # Only the first chunk of a given call_id starts a + # new call; later chunks just accumulate arguments. + is_new_call = self.track_function_call_start( + content.name, getattr(content, "call_id", None) + ) + + # Broadcast that a tool is being called (once per call) + if is_new_call and self._ws_manager: await self._ws_manager.broadcast( self.session_id, { @@ -259,8 +268,11 @@ async def _chat_async_streaming(self, prompt: str) -> str: self.track_function_call_arguments(args_chunk) elif content.type == "function_result": - # Function completed - finalize - self.finalize_tool_tracking() + # Function completed - finalize and capture result + self.track_function_result( + getattr(content, "call_id", None), + getattr(content, "result", None), + ) # Extract text from chunk if hasattr(chunk, 'text') and chunk.text: diff --git a/agentic_ai/agents/base_agent.py b/agentic_ai/agents/base_agent.py index 2c85b6551..7c7477f84 100644 --- a/agentic_ai/agents/base_agent.py +++ b/agentic_ai/agents/base_agent.py @@ -48,12 +48,33 @@ def get_tool_calls(self) -> List[Dict[str, Any]]: """ return self._tool_calls.copy() - def track_function_call_start(self, name: str) -> None: - """Start tracking a new function call. Call when function_call content is received.""" - # Finalize any previous function call first + def track_function_call_start(self, name: str, call_id: str | None = None) -> bool: + """Begin tracking a function call, returning True only for a genuinely new call. + + Streaming SDKs chunk function-call deltas differently. Some emit the tool + ``name`` only on the first chunk; agent-framework >= 1.7 repeats both the + ``name`` and a stable ``call_id`` on *every* delta chunk for the same + call. Without de-duplicating on ``call_id`` we would treat each argument + fragment (``{"``, ``customer``, ``_id`` ...) as a separate, malformed + tool call, which destroys tool-call-accuracy and task-adherence scoring. + + When ``call_id`` matches the call already in progress, this is a + continuation: we keep accumulating and return False. Otherwise we + finalize the previous call, start a new one, and return True (useful for + one-shot side effects such as broadcasting a ``tool_called`` event). + """ + if ( + call_id is not None + and self._current_function_call is not None + and self._current_function_call.get("call_id") == call_id + ): + # Same streaming call continuing — do not finalize or restart. + return False + # Finalize any previous function call first, then start the new one. self._finalize_current_function_call() - self._current_function_call = {"name": name} + self._current_function_call = {"name": name, "call_id": call_id} self._current_function_args = [] + return True def track_function_call_arguments(self, arguments: str) -> None: """Accumulate streaming function call arguments.""" @@ -79,13 +100,42 @@ def _finalize_current_function_call(self) -> None: self._tool_calls.append({ "name": self._current_function_call["name"], - "args": args + "args": args, + "call_id": self._current_function_call.get("call_id"), + "result": None, }) # Reset accumulators self._current_function_call = None self._current_function_args = [] + def track_function_result(self, call_id: str | None, result: Any) -> None: + """Attach a tool result to its matching captured call. + + Tool results are needed so that downstream evaluators (e.g. + task-adherence) can verify that the agent's claims are grounded in + actual tool output rather than fabricated. Finalizes the in-progress + call first, then matches the result to the most recent call sharing the + same ``call_id``. + """ + # The result signals the in-progress call has completed. + self._finalize_current_function_call() + if result is None: + return + result_str = str(result) + if len(result_str) > 2000: + result_str = result_str[:2000] + "…(truncated)" + if call_id: + for tc in reversed(self._tool_calls): + if tc.get("call_id") == call_id: + tc["result"] = result_str + return + # No call_id match — attach to the most recent call missing a result. + for tc in reversed(self._tool_calls): + if tc.get("result") is None: + tc["result"] = result_str + return + def finalize_tool_tracking(self) -> None: """Finalize any pending function calls. Call at end of streaming.""" self._finalize_current_function_call() @@ -94,7 +144,9 @@ def add_tool_call(self, name: str, args: Dict[str, Any] | None = None) -> None: """Directly add a tool call (for non-streaming scenarios).""" self._tool_calls.append({ "name": name, - "args": args or {} + "args": args or {}, + "call_id": None, + "result": None, }) diff --git a/agentic_ai/evaluations/metrics.py b/agentic_ai/evaluations/metrics.py index c0e657eeb..4f662d2a5 100644 --- a/agentic_ai/evaluations/metrics.py +++ b/agentic_ai/evaluations/metrics.py @@ -957,11 +957,13 @@ def evaluate_task_adherence( except json.JSONDecodeError: tool_args = {} + tool_call_id = tc.get("call_id") or tc.get("id") or f"call_{tool_name}" + response_messages.append({ "role": "assistant", "content": None, "tool_calls": [{ - "id": tc.get("id", f"call_{tool_name}"), + "id": tool_call_id, "type": "function", "function": { "name": tool_name, @@ -969,6 +971,17 @@ def evaluate_task_adherence( } }] }) + + # Include the tool RESULT so the judge can verify that the + # agent's claims are grounded in actual tool output. Without + # this, grounded responses look fabricated and task-adherence + # collapses to ~0. + tool_result = tc.get("result") + response_messages.append({ + "role": "tool", + "tool_call_id": tool_call_id, + "content": str(tool_result) if tool_result else "(no tool result captured)", + }) # Add final response response_messages.append({"role": "assistant", "content": response}) @@ -980,18 +993,32 @@ def evaluate_task_adherence( task=task_description, ) - # TaskAdherenceEvaluator returns a numeric score - # Keep 1-5 scale for portal parity (0 for failures) + # Parse the score robustly across SDK versions. + # + # azure-ai-evaluation 1.14.0 changed TaskAdherenceEvaluator into a + # binary "flagged" grader: it returns ``task_adherence`` in {0.0, 1.0} + # (1.0 == adhered / not flagged) plus an authoritative + # ``task_adherence_result`` of "pass"/"fail". Older/other versions + # return a genuine 1-5 score. Treating the binary 1.0 as a 1-5 score + # and thresholding at >= 3.0 mis-records every PASS as a fail and + # displays it as "1.0/5". Normalize both shapes here. raw_score = result.get("task_adherence", 0) - - # Handle boolean or numeric + result_label = result.get("task_adherence_result") + if isinstance(raw_score, bool): - score = 5.0 if raw_score else 0.0 + score = 5.0 if raw_score else 1.0 + passed = bool(raw_score) + elif result_label in ("pass", "fail"): + # Binary "flagged" grader (SDK >= 1.14.0): trust the result label. + passed = result_label == "pass" + numeric = _safe_float(raw_score) + # If the SDK still reports a real 1-5 score (> 1), preserve it; + # otherwise map the binary verdict to the 1-5 display scale. + score = numeric if numeric > 1.0 else (5.0 if passed else 1.0) else: + # Legacy 1-5 numeric score. score = _safe_float(raw_score) - - # Threshold: score >= 3 is passing - passed = score >= 3.0 + passed = score >= 3.0 return EvaluationResult( metric_name="task_adherence", @@ -1000,6 +1027,7 @@ def evaluate_task_adherence( passed=passed, details={ "raw_result": result, + "task_adherence_result": result_label, "tool_calls_count": len(tool_calls) if tool_calls else 0, "task_description_length": len(task_description), },