Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -518,8 +518,10 @@ async def _handle_agent_update(
# Real domain tool — track for the UI. Synthetic handoff
# tools are filtered out because the framework already
# surfaces those as ``handoff_sent`` events.
self.track_function_call_start(name)
if self._ws_manager:
is_new_call = self.track_function_call_start(
name, getattr(content, "call_id", None)
)
if is_new_call and self._ws_manager:
await self._ws_manager.broadcast(
self.session_id,
{
Expand All @@ -533,7 +535,10 @@ async def _handle_agent_update(
if args_chunk:
self.track_function_call_arguments(args_chunk)
elif ctype == "function_result":
self.finalize_tool_tracking()
self.track_function_result(
getattr(content, "call_id", None),
getattr(content, "result", None),
)

text = getattr(update, "text", None)
if text:
Expand Down
29 changes: 20 additions & 9 deletions agentic_ai/agents/agent_framework/multi_agent/reflection_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,14 +181,19 @@ async def _run_agent_non_streaming(
for content in chunk.contents:
if content.type == "function_call":
if content.name:
self.track_function_call_start(content.name)
self.track_function_call_start(
content.name, getattr(content, "call_id", None)
)

args_chunk = getattr(content, 'arguments', '')
if args_chunk:
self.track_function_call_arguments(args_chunk)

elif content.type == "function_result":
self.finalize_tool_tracking()
self.track_function_result(
getattr(content, "call_id", None),
getattr(content, "result", None),
)

# Collect text
if hasattr(chunk, 'text') and chunk.text:
Expand Down Expand Up @@ -223,20 +228,26 @@ async def _run_agent_streaming(
for content in chunk.contents:
if content.type == "function_call":
if content.name:
self.track_function_call_start(content.name)
is_new_call = self.track_function_call_start(
content.name, getattr(content, "call_id", None)
)

await self._broadcast_raw({
"type": "tool_called",
"agent_id": agent_id,
"tool_name": content.name,
})
if is_new_call:
await self._broadcast_raw({
"type": "tool_called",
"agent_id": agent_id,
"tool_name": content.name,
})

args_chunk = getattr(content, 'arguments', '')
if args_chunk:
self.track_function_call_arguments(args_chunk)

elif content.type == "function_result":
self.finalize_tool_tracking()
self.track_function_result(
getattr(content, "call_id", None),
getattr(content, "result", None),
)

# Stream text
if hasattr(chunk, 'text') and chunk.text:
Expand Down
40 changes: 26 additions & 14 deletions agentic_ai/agents/agent_framework/single_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,21 +174,27 @@ async def chat_async(self, prompt: str) -> str:
if hasattr(chunk, 'contents') and chunk.contents:
for content in chunk.contents:
if content.type == "function_call":
# Function call chunks come in pieces:
# 1. First chunk has name, empty arguments
# 2. Subsequent chunks have no name, partial arguments
# Function call chunks come in pieces. Older SDKs sent
# the name only on the first chunk; agent-framework >= 1.7
# repeats name + a stable call_id on every chunk, so we
# de-duplicate on call_id to avoid fragmenting one call
# into many malformed ones.
if content.name:
# New function call starting - finalize previous if any
self.track_function_call_start(content.name)

self.track_function_call_start(
content.name, getattr(content, "call_id", None)
)

# Accumulate arguments
args_chunk = getattr(content, 'arguments', '')
if args_chunk:
self.track_function_call_arguments(args_chunk)

elif content.type == "function_result":
# Function result means the call is complete
self.finalize_tool_tracking()
self.track_function_result(
getattr(content, "call_id", None),
getattr(content, "result", None),
)

# Extract text
if hasattr(chunk, 'text') and chunk.text:
Expand Down Expand Up @@ -238,11 +244,14 @@ async def _chat_async_streaming(self, prompt: str) -> str:
# Handle function calls - accumulate arguments across chunks
if content.type == "function_call":
if content.name:
# New function call - finalize previous and start new
self.track_function_call_start(content.name)

# Broadcast that a tool is being called
if self._ws_manager:
# Only the first chunk of a given call_id starts a
# new call; later chunks just accumulate arguments.
is_new_call = self.track_function_call_start(
content.name, getattr(content, "call_id", None)
)

# Broadcast that a tool is being called (once per call)
if is_new_call and self._ws_manager:
await self._ws_manager.broadcast(
self.session_id,
{
Expand All @@ -259,8 +268,11 @@ async def _chat_async_streaming(self, prompt: str) -> str:
self.track_function_call_arguments(args_chunk)

elif content.type == "function_result":
# Function completed - finalize
self.finalize_tool_tracking()
# Function completed - finalize and capture result
self.track_function_result(
getattr(content, "call_id", None),
getattr(content, "result", None),
)

# Extract text from chunk
if hasattr(chunk, 'text') and chunk.text:
Expand Down
64 changes: 58 additions & 6 deletions agentic_ai/agents/base_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,33 @@ def get_tool_calls(self) -> List[Dict[str, Any]]:
"""
return self._tool_calls.copy()

def track_function_call_start(self, name: str) -> None:
"""Start tracking a new function call. Call when function_call content is received."""
# Finalize any previous function call first
def track_function_call_start(self, name: str, call_id: str | None = None) -> bool:
"""Begin tracking a function call, returning True only for a genuinely new call.

Streaming SDKs chunk function-call deltas differently. Some emit the tool
``name`` only on the first chunk; agent-framework >= 1.7 repeats both the
``name`` and a stable ``call_id`` on *every* delta chunk for the same
call. Without de-duplicating on ``call_id`` we would treat each argument
fragment (``{"``, ``customer``, ``_id`` ...) as a separate, malformed
tool call, which destroys tool-call-accuracy and task-adherence scoring.

When ``call_id`` matches the call already in progress, this is a
continuation: we keep accumulating and return False. Otherwise we
finalize the previous call, start a new one, and return True (useful for
one-shot side effects such as broadcasting a ``tool_called`` event).
"""
if (
call_id is not None
and self._current_function_call is not None
and self._current_function_call.get("call_id") == call_id
):
# Same streaming call continuing — do not finalize or restart.
return False
# Finalize any previous function call first, then start the new one.
self._finalize_current_function_call()
self._current_function_call = {"name": name}
self._current_function_call = {"name": name, "call_id": call_id}
self._current_function_args = []
return True

def track_function_call_arguments(self, arguments: str) -> None:
"""Accumulate streaming function call arguments."""
Expand All @@ -79,13 +100,42 @@ def _finalize_current_function_call(self) -> None:

self._tool_calls.append({
"name": self._current_function_call["name"],
"args": args
"args": args,
"call_id": self._current_function_call.get("call_id"),
"result": None,
})

# Reset accumulators
self._current_function_call = None
self._current_function_args = []

def track_function_result(self, call_id: str | None, result: Any) -> None:
"""Attach a tool result to its matching captured call.

Tool results are needed so that downstream evaluators (e.g.
task-adherence) can verify that the agent's claims are grounded in
actual tool output rather than fabricated. Finalizes the in-progress
call first, then matches the result to the most recent call sharing the
same ``call_id``.
"""
# The result signals the in-progress call has completed.
self._finalize_current_function_call()
if result is None:
return
result_str = str(result)
if len(result_str) > 2000:
result_str = result_str[:2000] + "…(truncated)"
if call_id:
for tc in reversed(self._tool_calls):
if tc.get("call_id") == call_id:
tc["result"] = result_str
return
# No call_id match — attach to the most recent call missing a result.
for tc in reversed(self._tool_calls):
if tc.get("result") is None:
tc["result"] = result_str
return

def finalize_tool_tracking(self) -> None:
"""Finalize any pending function calls. Call at end of streaming."""
self._finalize_current_function_call()
Expand All @@ -94,7 +144,9 @@ def add_tool_call(self, name: str, args: Dict[str, Any] | None = None) -> None:
"""Directly add a tool call (for non-streaming scenarios)."""
self._tool_calls.append({
"name": name,
"args": args or {}
"args": args or {},
"call_id": None,
"result": None,
})


Expand Down
46 changes: 37 additions & 9 deletions agentic_ai/evaluations/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -957,18 +957,31 @@ def evaluate_task_adherence(
except json.JSONDecodeError:
tool_args = {}

tool_call_id = tc.get("call_id") or tc.get("id") or f"call_{tool_name}"

response_messages.append({
"role": "assistant",
"content": None,
"tool_calls": [{
"id": tc.get("id", f"call_{tool_name}"),
"id": tool_call_id,
"type": "function",
"function": {
"name": tool_name,
"arguments": json.dumps(tool_args) if isinstance(tool_args, dict) else str(tool_args),
}
}]
})

# Include the tool RESULT so the judge can verify that the
# agent's claims are grounded in actual tool output. Without
# this, grounded responses look fabricated and task-adherence
# collapses to ~0.
tool_result = tc.get("result")
response_messages.append({
"role": "tool",
"tool_call_id": tool_call_id,
"content": str(tool_result) if tool_result else "(no tool result captured)",
})

# Add final response
response_messages.append({"role": "assistant", "content": response})
Expand All @@ -980,18 +993,32 @@ def evaluate_task_adherence(
task=task_description,
)

# TaskAdherenceEvaluator returns a numeric score
# Keep 1-5 scale for portal parity (0 for failures)
# Parse the score robustly across SDK versions.
#
# azure-ai-evaluation 1.14.0 changed TaskAdherenceEvaluator into a
# binary "flagged" grader: it returns ``task_adherence`` in {0.0, 1.0}
# (1.0 == adhered / not flagged) plus an authoritative
# ``task_adherence_result`` of "pass"/"fail". Older/other versions
# return a genuine 1-5 score. Treating the binary 1.0 as a 1-5 score
# and thresholding at >= 3.0 mis-records every PASS as a fail and
# displays it as "1.0/5". Normalize both shapes here.
raw_score = result.get("task_adherence", 0)

# Handle boolean or numeric
result_label = result.get("task_adherence_result")

if isinstance(raw_score, bool):
score = 5.0 if raw_score else 0.0
score = 5.0 if raw_score else 1.0
passed = bool(raw_score)
elif result_label in ("pass", "fail"):
# Binary "flagged" grader (SDK >= 1.14.0): trust the result label.
passed = result_label == "pass"
numeric = _safe_float(raw_score)
# If the SDK still reports a real 1-5 score (> 1), preserve it;
# otherwise map the binary verdict to the 1-5 display scale.
score = numeric if numeric > 1.0 else (5.0 if passed else 1.0)
else:
# Legacy 1-5 numeric score.
score = _safe_float(raw_score)

# Threshold: score >= 3 is passing
passed = score >= 3.0
passed = score >= 3.0

return EvaluationResult(
metric_name="task_adherence",
Expand All @@ -1000,6 +1027,7 @@ def evaluate_task_adherence(
passed=passed,
details={
"raw_result": result,
"task_adherence_result": result_label,
"tool_calls_count": len(tool_calls) if tool_calls else 0,
"task_description_length": len(task_description),
},
Expand Down
21 changes: 15 additions & 6 deletions agentic_ai/evaluations/run_agent_eval.py
Original file line number Diff line number Diff line change
Expand Up @@ -275,12 +275,21 @@ async def run_foundry_evaluation(traces: List[AgentTrace], data_file: Path, agen
)

with project_client:
# Get OpenAI client from the project
# Explicitly set api-version to ensure azure_ai_evaluator support
# (the SDK default should be 2025-11-15-preview but we force it to be safe)
openai_client = project_client.get_openai_client(
default_query={"api-version": "2025-11-15-preview"}
)
# Get the OpenAI-compatible client from the project.
#
# azure-ai-projects >= 2.1.0 routes get_openai_client() at the new
# unified "/v1" path, which REJECTS the legacy "api-version" query
# parameter ("api-version query parameter is not allowed when using
# /v1 path"). Older pre-release SDKs needed api-version forced. So we
# try the modern (no api-version) call first and only fall back to
# forcing api-version if the installed SDK requires it.
try:
openai_client = project_client.get_openai_client()
except TypeError:
# Very old SDKs require an explicit api_version kwarg.
openai_client = project_client.get_openai_client(
api_version="2025-11-15-preview"
)

# Diagnostic logging for CI debugging
import azure.ai.projects
Expand Down
Loading