From e57a33d44dfdca3c15e33a24911b1d72a32526b8 Mon Sep 17 00:00:00 2001
From: "James N." <james.nguyen@microsoft.com>
Date: Tue, 9 Jun 2026 19:25:57 -0700
Subject: [PATCH 1/2] fix(eval): make Foundry --remote push work with
 azure-ai-projects 2.1.0

get_openai_client(default_query={'api-version': ...}) breaks on
azure-ai-projects >= 2.1.0: the new unified /v1 Foundry path rejects the
legacy api-version query parameter (400: 'api-version query parameter is
not allowed when using /v1 path'), so every --remote run failed to push
results to Azure AI Foundry.

Call get_openai_client() without forcing api-version, falling back to an
explicit api_version kwarg only for older SDKs that require it.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 agentic_ai/evaluations/run_agent_eval.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/agentic_ai/evaluations/run_agent_eval.py b/agentic_ai/evaluations/run_agent_eval.py
index 4b79cf0eb..147215951 100644
--- a/agentic_ai/evaluations/run_agent_eval.py
+++ b/agentic_ai/evaluations/run_agent_eval.py
@@ -275,12 +275,21 @@ async def run_foundry_evaluation(traces: List[AgentTrace], data_file: Path, agen
         )
         
         with project_client:
-            # Get OpenAI client from the project
-            # Explicitly set api-version to ensure azure_ai_evaluator support
-            # (the SDK default should be 2025-11-15-preview but we force it to be safe)
-            openai_client = project_client.get_openai_client(
-                default_query={"api-version": "2025-11-15-preview"}
-            )
+            # Get the OpenAI-compatible client from the project.
+            #
+            # azure-ai-projects >= 2.1.0 routes get_openai_client() to the new
+            # unified "/v1" path, which REJECTS the legacy "api-version" query
+            # parameter ("api-version query parameter is not allowed when using
+            # /v1 path"). Older pre-release SDKs needed api-version forced. So we
+            # try the modern (no api-version) call first and only fall back to
+            # forcing api-version if the installed SDK requires it.
+            try:
+                openai_client = project_client.get_openai_client()
+            except TypeError:
+                # Very old SDKs require an explicit api_version kwarg.
+                openai_client = project_client.get_openai_client(
+                    api_version="2025-11-15-preview"
+                )
             
             # Diagnostic logging for CI debugging
             import azure.ai.projects

From d059f8404f029e456a515c3d8aba7e1ea4f707f4 Mon Sep 17 00:00:00 2001
From: "James N." <james.nguyen@microsoft.com>
Date: Tue, 9 Jun 2026 20:09:55 -0700
Subject: [PATCH 2/2] fix(eval): correct tool-call capture and task-adherence
 scoring

Three related defects made tool_call_accuracy (~2.0) and task_adherence
(~0.2) misleadingly low, independent of real agent quality.

1. Streaming tool-call capture (agent-framework >= 1.7): every function_call
   delta chunk now carries the tool name + a stable call_id, but the code
   assumed only the first chunk had a name. Each argument fragment was
   finalized as its own malformed tool call, turning one
   get_customer_detail({"customer_id":5}) into 6+ garbage calls with
   {"_raw": ...} args (and spamming duplicate tool_called UI events).
   track_function_call_start() now de-duplicates on call_id and only
   starts/broadcasts a genuinely new call. One real call = one clean call.

2. Tool results were never captured. function_result content carries the
   tool output (keyed by call_id) but was discarded. The mixin now records
   it via track_function_result(), so backend tools_used and the evaluator
   can see what each tool returned.

3. task_adherence judge could not verify grounding. metrics.py passed tool
   calls but no tool results, so grounded answers looked fabricated (judge:
   "no corroborating tool interactions"). It now emits role:tool result
   messages. Additionally, azure-ai-evaluation 1.14.0 made
   TaskAdherenceEvaluator a binary "flagged" grader (score in {0,1} plus a
   pass/fail result); the old code treated it as 1-5 and thresholded >= 3,
   so every PASS was recorded as a fail and shown as 1.0/5. Scoring now
   honors the pass/fail result and maps the binary verdict to a 1-5 display.

Verified on a 5-case handoff run: tool calls captured cleanly (2-4 per
query, 0 malformed); task_adherence avg 0.2 -> 3.4, with the remaining low
scores being genuine hallucinations (agent citing data absent from tool
output) rather than measurement artifacts.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
---
 .../multi_agent/handoff_multi_domain_agent.py | 11 +++-
 .../multi_agent/reflection_agent.py           | 29 ++++++---
 .../agents/agent_framework/single_agent.py    | 40 ++++++++----
 agentic_ai/agents/base_agent.py               | 64 +++++++++++++++++--
 agentic_ai/evaluations/metrics.py             | 46 ++++++++++---
 5 files changed, 149 insertions(+), 41 deletions(-)

diff --git a/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py b/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py
index 6a3cb1205..4f51d35a2 100644
--- a/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py
+++ b/agentic_ai/agents/agent_framework/multi_agent/handoff_multi_domain_agent.py
@@ -518,8 +518,10 @@ async def _handle_agent_update(
                     # Real domain tool — track for the UI. Synthetic handoff
                     # tools are filtered out because the framework already
                     # surfaces those as ``handoff_sent`` events.
-                    self.track_function_call_start(name)
-                    if self._ws_manager:
+                    is_new_call = self.track_function_call_start(
+                        name, getattr(content, "call_id", None)
+                    )
+                    if is_new_call and self._ws_manager:
                         await self._ws_manager.broadcast(
                             self.session_id,
                             {
@@ -533,7 +535,10 @@ async def _handle_agent_update(
                 if args_chunk:
                     self.track_function_call_arguments(args_chunk)
             elif ctype == "function_result":
-                self.finalize_tool_tracking()
+                self.track_function_result(
+                    getattr(content, "call_id", None),
+                    getattr(content, "result", None),
+                )
 
         text = getattr(update, "text", None)
         if text:
diff --git a/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py b/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py
index fbb56f2a0..7641de1aa 100644
--- a/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py
+++ b/agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py
@@ -181,14 +181,19 @@ async def _run_agent_non_streaming(
                 for content in chunk.contents:
                     if content.type == "function_call":
                         if content.name:
-                            self.track_function_call_start(content.name)
+                            self.track_function_call_start(
+                                content.name, getattr(content, "call_id", None)
+                            )
                         
                         args_chunk = getattr(content, 'arguments', '')
                         if args_chunk:
                             self.track_function_call_arguments(args_chunk)
                     
                     elif content.type == "function_result":
-                        self.finalize_tool_tracking()
+                        self.track_function_result(
+                            getattr(content, "call_id", None),
+                            getattr(content, "result", None),
+                        )
             
             # Collect text
             if hasattr(chunk, 'text') and chunk.text:
@@ -223,20 +228,26 @@ async def _run_agent_streaming(
                 for content in chunk.contents:
                     if content.type == "function_call":
                         if content.name:
-                            self.track_function_call_start(content.name)
+                            is_new_call = self.track_function_call_start(
+                                content.name, getattr(content, "call_id", None)
+                            )
                             
-                            await self._broadcast_raw({
-                                "type": "tool_called",
-                                "agent_id": agent_id,
-                                "tool_name": content.name,
-                            })
+                            if is_new_call:
+                                await self._broadcast_raw({
+                                    "type": "tool_called",
+                                    "agent_id": agent_id,
+                                    "tool_name": content.name,
+                                })
                         
                         args_chunk = getattr(content, 'arguments', '')
                         if args_chunk:
                             self.track_function_call_arguments(args_chunk)
                     
                     elif content.type == "function_result":
-                        self.finalize_tool_tracking()
+                        self.track_function_result(
+                            getattr(content, "call_id", None),
+                            getattr(content, "result", None),
+                        )
             
             # Stream text
             if hasattr(chunk, 'text') and chunk.text:
diff --git a/agentic_ai/agents/agent_framework/single_agent.py b/agentic_ai/agents/agent_framework/single_agent.py
index f66ae4fe1..1886f1d90 100644
--- a/agentic_ai/agents/agent_framework/single_agent.py
+++ b/agentic_ai/agents/agent_framework/single_agent.py
@@ -174,13 +174,16 @@ async def chat_async(self, prompt: str) -> str:
             if hasattr(chunk, 'contents') and chunk.contents:
                 for content in chunk.contents:
                     if content.type == "function_call":
-                        # Function call chunks come in pieces:
-                        # 1. First chunk has name, empty arguments
-                        # 2. Subsequent chunks have no name, partial arguments
+                        # Function call chunks come in pieces. Older SDKs sent
+                        # the name only on the first chunk; agent-framework >= 1.7
+                        # repeats name + a stable call_id on every chunk, so we
+                        # de-duplicate on call_id to avoid fragmenting one call
+                        # into many malformed ones.
                         if content.name:
-                            # New function call starting - finalize previous if any
-                            self.track_function_call_start(content.name)
-                        
+                            self.track_function_call_start(
+                                content.name, getattr(content, "call_id", None)
+                            )
+
                         # Accumulate arguments
                         args_chunk = getattr(content, 'arguments', '')
                         if args_chunk:
@@ -188,7 +191,10 @@ async def chat_async(self, prompt: str) -> str:
                     
                     elif content.type == "function_result":
                         # Function result means the call is complete
-                        self.finalize_tool_tracking()
+                        self.track_function_result(
+                            getattr(content, "call_id", None),
+                            getattr(content, "result", None),
+                        )
             
             # Extract text
             if hasattr(chunk, 'text') and chunk.text:
@@ -238,11 +244,14 @@ async def _chat_async_streaming(self, prompt: str) -> str:
                         # Handle function calls - accumulate arguments across chunks
                         if content.type == "function_call":
                             if content.name:
-                                # New function call - finalize previous and start new
-                                self.track_function_call_start(content.name)
-                                
-                                # Broadcast that a tool is being called
-                                if self._ws_manager:
+                                # Only the first chunk of a given call_id starts a
+                                # new call; later chunks just accumulate arguments.
+                                is_new_call = self.track_function_call_start(
+                                    content.name, getattr(content, "call_id", None)
+                                )
+
+                                # Broadcast that a tool is being called (once per call)
+                                if is_new_call and self._ws_manager:
                                     await self._ws_manager.broadcast(
                                         self.session_id,
                                         {
@@ -259,8 +268,11 @@ async def _chat_async_streaming(self, prompt: str) -> str:
                                 self.track_function_call_arguments(args_chunk)
                         
                         elif content.type == "function_result":
-                            # Function completed - finalize
-                            self.finalize_tool_tracking()
+                            # Function completed - finalize and capture result
+                            self.track_function_result(
+                                getattr(content, "call_id", None),
+                                getattr(content, "result", None),
+                            )
                 
                 # Extract text from chunk
                 if hasattr(chunk, 'text') and chunk.text:
diff --git a/agentic_ai/agents/base_agent.py b/agentic_ai/agents/base_agent.py
index 2c85b6551..7c7477f84 100644
--- a/agentic_ai/agents/base_agent.py
+++ b/agentic_ai/agents/base_agent.py
@@ -48,12 +48,33 @@ def get_tool_calls(self) -> List[Dict[str, Any]]:
         """
         return self._tool_calls.copy()
     
-    def track_function_call_start(self, name: str) -> None:
-        """Start tracking a new function call. Call when function_call content is received."""
-        # Finalize any previous function call first
+    def track_function_call_start(self, name: str, call_id: str | None = None) -> bool:
+        """Begin tracking a function call, returning True only for a genuinely new call.
+
+        Streaming SDKs chunk function-call deltas differently. Some emit the tool
+        ``name`` only on the first chunk; agent-framework >= 1.7 repeats both the
+        ``name`` and a stable ``call_id`` on *every* delta chunk for the same
+        call. Without de-duplicating on ``call_id`` we would treat each argument
+        fragment (``{"``, ``customer``, ``_id`` ...) as a separate, malformed
+        tool call, which destroys tool-call-accuracy and task-adherence scoring.
+
+        When ``call_id`` matches the call already in progress, this is a
+        continuation: we keep accumulating and return False. Otherwise we
+        finalize the previous call, start a new one, and return True (useful for
+        one-shot side effects such as broadcasting a ``tool_called`` event).
+        """
+        if (
+            call_id is not None
+            and self._current_function_call is not None
+            and self._current_function_call.get("call_id") == call_id
+        ):
+            # Same streaming call continuing — do not finalize or restart.
+            return False
+        # Finalize any previous function call first, then start the new one.
         self._finalize_current_function_call()
-        self._current_function_call = {"name": name}
+        self._current_function_call = {"name": name, "call_id": call_id}
         self._current_function_args = []
+        return True
     
     def track_function_call_arguments(self, arguments: str) -> None:
         """Accumulate streaming function call arguments."""
@@ -79,13 +100,42 @@ def _finalize_current_function_call(self) -> None:
         
         self._tool_calls.append({
             "name": self._current_function_call["name"],
-            "args": args
+            "args": args,
+            "call_id": self._current_function_call.get("call_id"),
+            "result": None,
         })
         
         # Reset accumulators
         self._current_function_call = None
         self._current_function_args = []
     
+    def track_function_result(self, call_id: str | None, result: Any) -> None:
+        """Attach a tool result to its matching captured call.
+
+        Tool results are needed so that downstream evaluators (e.g.
+        task-adherence) can verify that the agent's claims are grounded in
+        actual tool output rather than fabricated. Finalizes the in-progress
+        call first, then attaches the result (stringified, capped at 2000 chars)
+        to the latest call with this ``call_id``, else the latest without a result.
+        """
+        # The result signals the in-progress call has completed.
+        self._finalize_current_function_call()
+        if result is None:
+            return
+        result_str = str(result)
+        if len(result_str) > 2000:
+            result_str = result_str[:2000] + "…(truncated)"
+        if call_id:
+            for tc in reversed(self._tool_calls):
+                if tc.get("call_id") == call_id:
+                    tc["result"] = result_str
+                    return
+        # No call_id match — attach to the most recent call missing a result.
+        for tc in reversed(self._tool_calls):
+            if tc.get("result") is None:
+                tc["result"] = result_str
+                return
+    
     def finalize_tool_tracking(self) -> None:
         """Finalize any pending function calls. Call at end of streaming."""
         self._finalize_current_function_call()
@@ -94,7 +144,9 @@ def add_tool_call(self, name: str, args: Dict[str, Any] | None = None) -> None:
         """Directly add a tool call (for non-streaming scenarios)."""
         self._tool_calls.append({
             "name": name,
-            "args": args or {}
+            "args": args or {},
+            "call_id": None,
+            "result": None,
         })
 
 
diff --git a/agentic_ai/evaluations/metrics.py b/agentic_ai/evaluations/metrics.py
index c0e657eeb..4f662d2a5 100644
--- a/agentic_ai/evaluations/metrics.py
+++ b/agentic_ai/evaluations/metrics.py
@@ -957,11 +957,13 @@ def evaluate_task_adherence(
                         except json.JSONDecodeError:
                             tool_args = {}
                     
+                    tool_call_id = tc.get("call_id") or tc.get("id") or f"call_{tool_name}"
+                    
                     response_messages.append({
                         "role": "assistant",
                         "content": None,
                         "tool_calls": [{
-                            "id": tc.get("id", f"call_{tool_name}"),
+                            "id": tool_call_id,
                             "type": "function",
                             "function": {
                                 "name": tool_name,
@@ -969,6 +971,17 @@ def evaluate_task_adherence(
                             }
                         }]
                     })
+                    
+                    # Include the tool RESULT so the judge can verify that the
+                    # agent's claims are grounded in actual tool output. Without
+                    # this, grounded responses look fabricated and task-adherence
+                    # collapses to ~0.
+                    tool_result = tc.get("result")
+                    response_messages.append({
+                        "role": "tool",
+                        "tool_call_id": tool_call_id,
+                        "content": str(tool_result) if tool_result else "(no tool result captured)",
+                    })
             
             # Add final response
             response_messages.append({"role": "assistant", "content": response})
@@ -980,18 +993,32 @@ def evaluate_task_adherence(
                 task=task_description,
             )
             
-            # TaskAdherenceEvaluator returns a numeric score
-            # Keep 1-5 scale for portal parity (0 for failures)
+            # Parse the score robustly across SDK versions.
+            #
+            # azure-ai-evaluation 1.14.0 changed TaskAdherenceEvaluator into a
+            # binary "flagged" grader: it returns ``task_adherence`` in {0.0, 1.0}
+            # (1.0 == adhered / not flagged) plus an authoritative
+            # ``task_adherence_result`` of "pass"/"fail". Older/other versions
+            # return a genuine 1-5 score. Treating the binary 1.0 as a 1-5 score
+            # and thresholding at >= 3.0 mis-records every PASS as a fail and
+            # displays it as "1.0/5". Normalize both shapes here.
             raw_score = result.get("task_adherence", 0)
-            
-            # Handle boolean or numeric
+            result_label = result.get("task_adherence_result")
+
             if isinstance(raw_score, bool):
-                score = 5.0 if raw_score else 0.0
+                score = 5.0 if raw_score else 1.0
+                passed = bool(raw_score)
+            elif result_label in ("pass", "fail"):
+                # Binary "flagged" grader (SDK >= 1.14.0): trust the result label.
+                passed = result_label == "pass"
+                numeric = _safe_float(raw_score)
+                # If the SDK still reports a real 1-5 score (> 1), preserve it;
+                # otherwise map the binary verdict to the 1-5 display scale.
+                score = numeric if numeric > 1.0 else (5.0 if passed else 1.0)
             else:
+                # Legacy 1-5 numeric score.
                 score = _safe_float(raw_score)
-            
-            # Threshold: score >= 3 is passing
-            passed = score >= 3.0
+                passed = score >= 3.0
             
             return EvaluationResult(
                 metric_name="task_adherence",
@@ -1000,6 +1027,7 @@ def evaluate_task_adherence(
                 passed=passed,
                 details={
                     "raw_result": result,
+                    "task_adherence_result": result_label,
                     "tool_calls_count": len(tool_calls) if tool_calls else 0,
                     "task_description_length": len(task_description),
                 },