diff --git a/packages/agent-runtime/src/__tests__/tool-stream-parser.test.ts b/packages/agent-runtime/src/__tests__/tool-stream-parser.test.ts
index 391610c27..c258ff96e 100644
--- a/packages/agent-runtime/src/__tests__/tool-stream-parser.test.ts
+++ b/packages/agent-runtime/src/__tests__/tool-stream-parser.test.ts
@@ -47,10 +47,6 @@ describe('processStreamWithTags', () => {
       },
     }
 
-    function onError(name: string, error: string) {
-      events.push({ name, error })
-    }
-
     const result: string[] = []
     const responseChunks: any[] = []
 
@@ -70,7 +66,6 @@ describe('processStreamWithTags', () => {
       stream,
       processors,
       defaultProcessor,
-      onError,
       onResponseChunk,
       executeXmlToolCall: async () => {},
     })) {
@@ -114,10 +109,6 @@ describe('processStreamWithTags', () => {
       },
     }
 
-    function onError(name: string, error: string) {
-      events.push({ name, error })
-    }
-
     const result: string[] = []
     const responseChunks: any[] = []
 
@@ -137,7 +128,6 @@ describe('processStreamWithTags', () => {
       stream,
       processors,
       defaultProcessor,
-      onError,
       onResponseChunk,
       executeXmlToolCall: async () => {},
     })) {
@@ -191,10 +181,6 @@ describe('processStreamWithTags', () => {
       },
     }
 
-    function onError(name: string, error: string) {
-      events.push({ name, error })
-    }
-
     const result: string[] = []
     const responseChunks: any[] = []
 
@@ -214,7 +200,6 @@ describe('processStreamWithTags', () => {
       stream,
       processors,
       defaultProcessor,
-      onError,
       onResponseChunk,
       executeXmlToolCall: async () => {},
     })) {
@@ -267,10 +252,6 @@ describe('processStreamWithTags', () => {
       },
     }
 
-    function onError(name: string, error: string) {
-      events.push({ name, error, type: 'error' })
-    }
-
     const responseChunks: any[] = []
 
     function onResponseChunk(chunk: any) {
@@ -295,7 +276,6 @@ describe('processStreamWithTags', () => {
       stream,
       processors,
       defaultProcessor,
-      onError,
       onResponseChunk,
       executeXmlToolCall: async () => {},
     })) {
@@ -341,10 +321,6 @@ describe('processStreamWithTags', () => {
       },
     }
 
-    function onError(name: string, error: string) {
-      events.push({ name, error, type: 'error' })
-    }
-
     const result: string[] = []
     const responseChunks: any[] = []
 
@@ -364,7 +340,6 @@ describe('processStreamWithTags', () => {
       stream,
       processors,
       defaultProcessor,
-      onError,
       onResponseChunk,
       executeXmlToolCall: async () => {},
     })) {
@@ -414,10 +389,6 @@ describe('processStreamWithTags', () => {
       },
     }
 
-    function onError(name: string, error: string) {
-      events.push({ name, error, type: 'error' })
-    }
-
     const result: string[] = []
     const responseChunks: any[] = []
 
@@ -437,7 +408,6 @@ describe('processStreamWithTags', () => {
       stream,
       processors,
       defaultProcessor,
-      onError,
       onResponseChunk,
       executeXmlToolCall: async () => {},
     })) {
@@ -468,10 +438,6 @@ describe('processStreamWithTags', () => {
 
     const processors = {}
 
-    function onError(name: string, error: string) {
-      events.push({ name, error, type: 'error' })
-    }
-
     const result: string[] = []
     const responseChunks: any[] = []
 
@@ -491,7 +457,6 @@ describe('processStreamWithTags', () => {
       stream,
       processors,
       defaultProcessor,
-      onError,
       onResponseChunk,
       executeXmlToolCall: async () => {},
     })) {
@@ -515,10 +480,6 @@ describe('processStreamWithTags', () => {
 
     const processors = {}
 
-    function onError(name: string, error: string) {
-      events.push({ name, error, type: 'error' })
-    }
-
     const result: string[] = []
     const responseChunks: any[] = []
 
@@ -538,7 +499,6 @@ describe('processStreamWithTags', () => {
       stream,
       processors,
       defaultProcessor,
-      onError,
       onResponseChunk,
       executeXmlToolCall: async () => {},
     })) {
diff --git a/packages/agent-runtime/src/__tests__/tool-validation-error.test.ts b/packages/agent-runtime/src/__tests__/tool-validation-error.test.ts
index d9ea5d89f..df9c1997d 100644
--- a/packages/agent-runtime/src/__tests__/tool-validation-error.test.ts
+++ b/packages/agent-runtime/src/__tests__/tool-validation-error.test.ts
@@ -1,6 +1,7 @@
 import { TEST_AGENT_RUNTIME_IMPL } from '@codebuff/common/testing/impl/agent-runtime'
 import { getInitialSessionState } from '@codebuff/common/types/session-state'
 import { promptSuccess } from '@codebuff/common/util/error'
+import { jsonToolResult } from '@codebuff/common/util/messages'
 import { beforeEach, describe, expect, it } from 'bun:test'
 
 import { mockFileContext } from './test-utils'
@@ -12,6 +13,10 @@ import type {
   AgentRuntimeScopedDeps,
 } from '@codebuff/common/types/contracts/agent-runtime'
 import type { StreamChunk } from '@codebuff/common/types/contracts/llm'
+import type {
+  AssistantMessage,
+  ToolMessage,
+} from '@codebuff/common/types/messages/codebuff-message'
 import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
 
 describe('tool validation error handling', () => {
@@ -225,4 +230,127 @@ describe('tool validation error handling', () => {
     )
     expect(errorEvents.length).toBe(0)
   })
+
+  it('should preserve tool_call/tool_result ordering when custom tool setup is async', async () => { // regression: the tool result must follow its tool call in messageHistory
+    const toolName = 'delayed_custom_tool'
+    const agentWithCustomTool: AgentTemplate = {
+      ...testAgentTemplate,
+      toolNames: [toolName, 'end_turn'],
+    }
+
+    const delayedToolCallChunk: StreamChunk = {
+      type: 'tool-call',
+      toolName,
+      toolCallId: 'delayed-custom-tool-call-id',
+      input: {
+        query: 'test',
+      },
+    }
+
+    async function* mockStream() {
+      yield delayedToolCallChunk // the stream's only chunk: a single custom tool call
+      return promptSuccess('mock-message-id')
+    }
+
+    const fileContextWithCustomTool = {
+      ...mockFileContext,
+      customToolDefinitions: {
+        [toolName]: {
+          inputSchema: {
+            type: 'object',
+            properties: {
+              query: { type: 'string' },
+            },
+            required: ['query'],
+            additionalProperties: false,
+          },
+          endsAgentStep: false,
+          description: 'A delayed custom tool for ordering tests',
+        },
+      },
+    }
+
+    const sessionState = getInitialSessionState(fileContextWithCustomTool)
+    const agentState = sessionState.mainAgentState
+
+    agentRuntimeImpl.requestMcpToolData = async () => {
+      // Force an async gap (20 ms timer) so tool_call emission happens after stream completion.
+      await new Promise((resolve) => setTimeout(resolve, 20))
+      return []
+    }
+    agentRuntimeImpl.requestToolCall = async () => ({ // stub: always returns a jsonToolResult({ ok: true }) output
+      output: jsonToolResult({ ok: true }),
+    })
+
+    await processStream({
+      ...agentRuntimeImpl,
+      agentContext: {},
+      agentState,
+      agentStepId: 'test-step-id',
+      agentTemplate: agentWithCustomTool,
+      ancestorRunIds: [],
+      clientSessionId: 'test-session',
+      fileContext: fileContextWithCustomTool,
+      fingerprintId: 'test-fingerprint',
+      fullResponse: '',
+      localAgentTemplates: { 'test-agent': agentWithCustomTool },
+      messages: [],
+      prompt: 'test prompt',
+      repoId: undefined,
+      repoUrl: undefined,
+      runId: 'test-run-id',
+      signal: new AbortController().signal,
+      stream: mockStream(),
+      system: 'test system',
+      tools: {},
+      userId: 'test-user',
+      userInputId: 'test-input-id',
+      onCostCalculated: async () => {},
+      onResponseChunk: () => {},
+    })
+
+    const assistantToolCallMessages = agentState.messageHistory.filter(
+      (m): m is AssistantMessage =>
+        m.role === 'assistant' &&
+        m.content.some((c) => c.type === 'tool-call' && c.toolName === toolName),
+    ) // assistant messages containing a tool-call part for this tool
+    const toolMessages = agentState.messageHistory.filter(
+      (m): m is ToolMessage => m.role === 'tool' && m.toolName === toolName,
+    ) // tool (result) messages for this tool
+
+    expect(assistantToolCallMessages.length).toBe(1) // exactly one call recorded
+    expect(toolMessages.length).toBe(1) // exactly one result recorded
+
+    const assistantToolCallPart = assistantToolCallMessages[0].content.find(
+      (
+        c,
+      ): c is Extract<AssistantMessage['content'][number], { type: 'tool-call' }> =>
+        c.type === 'tool-call' && c.toolName === toolName,
+    )
+    expect(assistantToolCallPart).toBeDefined()
+    expect(toolMessages[0].toolCallId).toBe(assistantToolCallPart!.toolCallId) // result is paired with the call by id
+
+    const assistantIndex = agentState.messageHistory.indexOf(
+      assistantToolCallMessages[0],
+    )
+    const toolResultIndex = agentState.messageHistory.indexOf(toolMessages[0])
+    expect(assistantIndex).toBeGreaterThanOrEqual(0)
+    expect(toolResultIndex).toBeGreaterThan(assistantIndex) // the ordering under test: result comes after the call
+
+    const assistantToolCallIds = new Set(
+      agentState.messageHistory.flatMap((message) => {
+        if (message.role !== 'assistant') {
+          return []
+        }
+        return message.content.flatMap((part) =>
+          part.type === 'tool-call' ? [part.toolCallId] : [],
+        )
+      }),
+    ) // every toolCallId that appears in any assistant message
+    const orphanToolResults = agentState.messageHistory.filter(
+      (message): message is ToolMessage =>
+        message.role === 'tool' && !assistantToolCallIds.has(message.toolCallId),
+    ) // tool messages whose toolCallId has no matching assistant tool-call
+    expect(orphanToolResults.length).toBe(0)
+  })
 })
diff --git a/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts b/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts
index e187abfe2..65c6742d8 100644
--- a/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts
+++ b/packages/agent-runtime/src/__tests__/xml-tool-result-ordering.test.ts
@@ -59,7 +59,6 @@ describe('XML tool result ordering', () => {
       stream,
       processors: {},
       defaultProcessor,
-      onError: () => {},
       onResponseChunk,
       executeXmlToolCall: async ({ toolName, input }) => {
         executionOrder.push(`executeXmlToolCall:${toolName}`)
@@ -136,7 +135,6 @@ describe('XML tool result ordering', () => {
       stream,
       processors: {},
       defaultProcessor,
-      onError: () => {},
       onResponseChunk,
       executeXmlToolCall: async ({ toolName }) => {
         // Simulate tool_call event
@@ -206,7 +204,6 @@ describe('XML tool result ordering', () => {
         stream,
         processors: {},
         defaultProcessor: () => ({ onTagStart: () => {}, onTagEnd: () => {} }),
-        onError: () => {},
         onResponseChunk: () => {},
         executeXmlToolCall: async () => {
           // Simulate tool execution with async work
diff --git a/packages/agent-runtime/src/run-programmatic-step.ts b/packages/agent-runtime/src/run-programmatic-step.ts
index 07cc27a06..64addd410 100644
--- a/packages/agent-runtime/src/run-programmatic-step.ts
+++ b/packages/agent-runtime/src/run-programmatic-step.ts
@@ -81,8 +81,9 @@ export async function runProgrammaticStep(
     | 'fileProcessingState'
     | 'toolCallId'
     | 'toolCalls'
+    | 'toolCallsToAddToMessageHistory'
     | 'toolResults'
-    | 'toolResultsToAddAfterStream'
+    | 'toolResultsToAddToMessageHistory'
   > &
     ParamsExcluding<
       AddAgentStepFn,
@@ -137,16 +138,16 @@ export async function runProgrammaticStep(
   if (!generator) {
     const createLogMethod =
       (level: 'debug' | 'info' | 'warn' | 'error') =>
-      (data: any, msg?: string) => {
-        logger[level](data, msg) // Log to backend
-        handleStepsLogChunk({
-          userInputId,
-          runId: agentState.runId ?? 'undefined',
-          level,
-          data,
-          message: msg,
-        })
-      }
+        (data: any, msg?: string) => {
+          logger[level](data, msg) // Log to backend
+          handleStepsLogChunk({
+            userInputId,
+            runId: agentState.runId ?? 'undefined',
+            level,
+            data,
+            message: msg,
+          })
+        }
 
     const streamingLogger = {
       debug: createLogMethod('debug'),
@@ -243,7 +244,7 @@ export async function runProgrammaticStep(
       if (!parseResult.success) {
         throw new Error(
           `Invalid yield value from handleSteps in agent ${template.id}: ${parseResult.error.message}. ` +
-            `Received: ${JSON.stringify(result.value)}`,
+          `Received: ${JSON.stringify(result.value)}`,
         )
       }
 
@@ -334,9 +335,8 @@ export async function runProgrammaticStep(
   } catch (error) {
     endTurn = true
 
-    const errorMessage = `Error executing handleSteps for agent ${template.id}: ${
-      error instanceof Error ? error.message : 'Unknown error'
-    }`
+    const errorMessage = `Error executing handleSteps for agent ${template.id}: ${error instanceof Error ? error.message : 'Unknown error'
+      }`
     logger.error(
       { error: getErrorObject(error), template: template.id },
       errorMessage,
@@ -428,7 +428,8 @@ type ExecuteToolCallsArrayParams = Omit<
   | 'autoInsertEndStepParam'
   | 'excludeToolFromMessageHistory'
   | 'toolCallId'
-  | 'toolResultsToAddAfterStream'
+  | 'toolCallsToAddToMessageHistory'
+  | 'toolResultsToAddToMessageHistory'
 > & {
   agentState: AgentState
   onResponseChunk: (chunk: string | PrintModeEvent) => void
@@ -485,6 +486,7 @@ async function executeSingleToolCall(
     // })
   }
 
+  const toolResultsToAddToMessageHistory: ToolMessage[] = []
   // Execute the tool call
   await executeToolCall({
     ...params,
@@ -494,7 +496,9 @@ async function executeSingleToolCall(
     excludeToolFromMessageHistory,
     fromHandleSteps: true,
     toolCallId,
-    toolResultsToAddAfterStream: [],
+    toolCalls: [],
+    toolCallsToAddToMessageHistory: [],
+    toolResultsToAddToMessageHistory,
 
     onResponseChunk: (chunk: string | PrintModeEvent) => {
       if (typeof chunk === 'string') {
@@ -539,6 +543,9 @@ async function executeSingleToolCall(
     },
   })
 
+  agentState.messageHistory = [...agentState.messageHistory]
+  agentState.messageHistory.push(...toolResultsToAddToMessageHistory)
+
   // Get the latest tool result
   return toolResults[toolResults.length - 1]?.content
 }
diff --git a/packages/agent-runtime/src/tool-stream-parser.ts b/packages/agent-runtime/src/tool-stream-parser.ts
index 7beea5485..543a07f62 100644
--- a/packages/agent-runtime/src/tool-stream-parser.ts
+++ b/packages/agent-runtime/src/tool-stream-parser.ts
@@ -21,15 +21,26 @@ export async function* processStreamWithTools(params: {
   processors: Record<
     string,
     {
-      onTagStart: (tagName: string, attributes: Record<string, string>) => void
-      onTagEnd: (tagName: string, params: Record<string, any>) => void
+      onTagStart: (
+        tagName: string,
+        attributes: Record<string, string>,
+      ) => void | Promise<void>
+      onTagEnd: (
+        tagName: string,
+        params: Record<string, any>,
+      ) => void | Promise<void>
     }
   >
   defaultProcessor: (toolName: string) => {
-    onTagStart: (tagName: string, attributes: Record<string, string>) => void
-    onTagEnd: (tagName: string, params: Record<string, any>) => void
+    onTagStart: (
+      tagName: string,
+      attributes: Record<string, string>,
+    ) => void | Promise<void>
+    onTagEnd: (
+      tagName: string,
+      params: Record<string, any>,
+    ) => void | Promise<void>
   }
-  onError: (tagName: string, errorMessage: string) => void
   onResponseChunk: (chunk: PrintModeText | PrintModeError) => void
   logger: Logger
   loggerOptions?: {
@@ -48,7 +59,6 @@ export async function* processStreamWithTools(params: {
     stream,
     processors,
     defaultProcessor,
-    onError: _onError,
     onResponseChunk,
     logger,
     loggerOptions,
@@ -62,11 +72,11 @@ export async function* processStreamWithTools(params: {
   // State for parsing XML tool calls from text stream
   const xmlParserState: StreamParserState = createStreamParserState()
 
-  function processToolCallObject(params: {
+  async function processToolCallObject(params: {
     toolName: string
     input: any
     contents?: string
-  }): void {
+  }): Promise<void> {
     const { toolName, input, contents } = params
 
     const processor = processors[toolName] ?? defaultProcessor(toolName)
@@ -85,8 +95,8 @@ export async function* processStreamWithTools(params: {
       logger,
     })
 
-    processor.onTagStart(toolName, {})
-    processor.onTagEnd(toolName, input)
+    await processor.onTagStart(toolName, {})
+    await processor.onTagEnd(toolName, input)
   }
 
   function flush() {
@@ -146,7 +156,7 @@ export async function* processStreamWithTools(params: {
     }
 
     if (chunk.type === 'tool-call') {
-      processToolCallObject(chunk)
+      await processToolCallObject(chunk)
     }
 
     yield chunk
diff --git a/packages/agent-runtime/src/tools/stream-parser.ts b/packages/agent-runtime/src/tools/stream-parser.ts
index ac23372a6..0af5ce5b1 100644
--- a/packages/agent-runtime/src/tools/stream-parser.ts
+++ b/packages/agent-runtime/src/tools/stream-parser.ts
@@ -1,12 +1,10 @@
 import { toolNames } from '@codebuff/common/tools/constants'
 import { buildArray } from '@codebuff/common/util/array'
 import {
-  jsonToolResult,
   assistantMessage,
   userMessage,
 } from '@codebuff/common/util/messages'
 import { generateCompactId } from '@codebuff/common/util/string'
-import { cloneDeep } from 'lodash'
 
 import { processStreamWithTools } from '../tool-stream-parser'
 import {
@@ -14,7 +12,7 @@ import {
   executeToolCall,
   tryTransformAgentToolCall,
 } from './tool-executor'
-import { expireMessages, withSystemTags } from '../util/messages'
+import { withSystemTags } from '../util/messages'
 
 import type { CustomToolCall, ExecuteToolCallParams } from './tool-executor'
 import type { AgentTemplate } from '../templates/types'
@@ -58,15 +56,15 @@ export async function processStream(
     | 'state'
     | 'toolCallId'
     | 'toolCalls'
+    | 'toolCallsToAddToMessageHistory'
     | 'toolName'
     | 'toolResults'
-    | 'toolResultsToAddAfterStream'
+    | 'toolResultsToAddToMessageHistory'
   > &
     ParamsExcluding<
       typeof processStreamWithTools,
       | 'processors'
       | 'defaultProcessor'
-      | 'onError'
       | 'loggerOptions'
       | 'executeXmlToolCall'
     >,
@@ -87,8 +85,9 @@ export async function processStream(
 
   // === MUTABLE STATE ===
   const toolResults: ToolMessage[] = []
-  const toolResultsToAddAfterStream: ToolMessage[] = []
+  const toolResultsToAddToMessageHistory: ToolMessage[] = []
   const toolCalls: (CodebuffToolCall | CustomToolCall)[] = []
+  const toolCallsToAddToMessageHistory: (CodebuffToolCall | CustomToolCall)[] = []
   const assistantMessages: Message[] = []
   let hadToolCallError = false
   const errorMessages: Message[] = []
@@ -107,22 +106,10 @@ export async function processStream(
   // === RESPONSE HANDLER ===
   // Creates a response handler that captures tool events into assistantMessages.
   // When isXmlMode=true, also captures tool_result events for interleaved ordering.
-  function createResponseHandler(isXmlMode: boolean) {
+  function createResponseHandler() {
     return (chunk: string | PrintModeEvent) => {
       if (typeof chunk !== 'string') {
-        if (chunk.type === 'tool_call') {
-          assistantMessages.push(
-            assistantMessage({ ...chunk, type: 'tool-call' }),
-          )
-        } else if (isXmlMode && chunk.type === 'tool_result') {
-          const toolResultMessage: ToolMessage = {
-            role: 'tool',
-            toolName: chunk.toolName,
-            toolCallId: chunk.toolCallId,
-            content: chunk.output,
-          }
-          assistantMessages.push(toolResultMessage)
-        } else if (chunk.type === 'error') {
+        if (chunk.type === 'error') {
           hadToolCallError = true
           errorMessages.push(
             userMessage(
@@ -139,14 +126,10 @@ export async function processStream(
 
   // === TOOL EXECUTION ===
   // Unified callback factory for both native and custom tools.
-  // isXmlMode=true: execute immediately, capture results inline (for XML tool calls)
-  // isXmlMode=false: defer execution, results added at end (for native tool calls)
   function createToolExecutionCallback(toolName: string, isXmlMode: boolean) {
-    const responseHandler = createResponseHandler(isXmlMode)
-    const resultsArray = isXmlMode ? [] : toolResultsToAddAfterStream
-
+    const responseHandler = createResponseHandler()
     return {
-      onTagStart: () => {},
+      onTagStart: () => { },
       onTagEnd: async (_: string, input: Record<string, string>) => {
         if (signal.aborted) {
           return
@@ -157,10 +140,10 @@ export async function processStream(
         // Check if this is an agent tool call that should be transformed to spawn_agents
         const transformed = !isNativeTool
           ? tryTransformAgentToolCall({
-              toolName,
-              input,
-              spawnableAgents: agentTemplate.spawnableAgents,
-            })
+            toolName,
+            input,
+            spawnableAgents: agentTemplate.spawnableAgents,
+          })
           : null
 
         // Read previousToolCallFinished at execution time to ensure proper sequential chaining.
@@ -182,14 +165,16 @@ export async function processStream(
               : (toolName as ToolName),
             input: transformed ? transformed.input : input,
             fromHandleSteps: false,
-            skipDirectResultPush: isXmlMode,
+
             fileProcessingState,
             fullResponse: fullResponseChunks.join(''),
             previousToolCallFinished: previousPromise,
             toolCallId,
             toolCalls,
+            toolCallsToAddToMessageHistory,
             toolResults,
-            toolResultsToAddAfterStream: resultsArray,
+            toolResultsToAddToMessageHistory,
+            excludeToolFromMessageHistory: false,
             onCostCalculated,
             onResponseChunk: responseHandler,
           })
@@ -199,14 +184,16 @@ export async function processStream(
             ...params,
             toolName,
             input,
-            skipDirectResultPush: isXmlMode,
+
             fileProcessingState,
             fullResponse: fullResponseChunks.join(''),
             previousToolCallFinished: previousPromise,
             toolCallId,
             toolCalls,
+            toolCallsToAddToMessageHistory,
             toolResults,
-            toolResultsToAddAfterStream: resultsArray,
+            toolResultsToAddToMessageHistory,
+            excludeToolFromMessageHistory: false,
             onResponseChunk: responseHandler,
           })
         }
@@ -236,16 +223,6 @@ export async function processStream(
     ]),
     defaultProcessor: (name: string) =>
       createToolExecutionCallback(name, false),
-    onError: (toolName, error) => {
-      const toolResult: ToolMessage = {
-        role: 'tool',
-        toolName,
-        toolCallId: generateCompactId(),
-        content: jsonToolResult({ errorMessage: error }),
-      }
-      toolResults.push(cloneDeep(toolResult))
-      toolResultsToAddAfterStream.push(cloneDeep(toolResult))
-    },
     loggerOptions: {
       userId,
       model: agentTemplate.model,
@@ -327,20 +304,22 @@ export async function processStream(
     }
   }
 
-  // === FINALIZATION ===
-  agentState.messageHistory = buildArray<Message>([
-    ...expireMessages(agentState.messageHistory, 'agentStep'),
-    ...assistantMessages,
-    ...toolResultsToAddAfterStream,
-  ])
-
   if (!signal.aborted) {
     resolveStreamDonePromise()
     await previousToolCallFinished
   }
 
-  // Error messages must come AFTER tool results for proper API ordering
-  agentState.messageHistory.push(...errorMessages)
+  // === FINALIZATION ===
+  // Build message history from the current agentState.messageHistory so that
+  // inline agent modifications (e.g. set_messages) are preserved. Appended in
+  // fixed order: assistant messages, tool calls, tool results, then errors.
+  agentState.messageHistory = buildArray<Message>([
+    ...agentState.messageHistory,
+    ...assistantMessages,
+    ...toolCallsToAddToMessageHistory.map((toolCall) => assistantMessage({ ...toolCall, type: 'tool-call' })),
+    ...toolResultsToAddToMessageHistory,
+    ...errorMessages,
+  ])
 
   return {
     fullResponse: fullResponseChunks.join(''),
diff --git a/packages/agent-runtime/src/tools/tool-executor.ts b/packages/agent-runtime/src/tools/tool-executor.ts
index 02841f5b9..23d2e7880 100644
--- a/packages/agent-runtime/src/tools/tool-executor.ts
+++ b/packages/agent-runtime/src/tools/tool-executor.ts
@@ -33,7 +33,7 @@ import type { Logger } from '@codebuff/common/types/contracts/logger'
 import type { ToolMessage } from '@codebuff/common/types/messages/codebuff-message'
 import type { ToolResultOutput } from '@codebuff/common/types/messages/content-part'
 import type { PrintModeEvent } from '@codebuff/common/types/print-mode'
-import type { AgentTemplateType , AgentState, Subgoal } from '@codebuff/common/types/session-state'
+import type { AgentTemplateType, AgentState, Subgoal } from '@codebuff/common/types/session-state'
 import type {
   CustomToolDefinitions,
   ProjectFileContext,
@@ -119,9 +119,9 @@ export type ExecuteToolCallParams<T extends string = ToolName> = {
   tools: ToolSet
   toolCallId: string | undefined
   toolCalls: (CodebuffToolCall | CustomToolCall)[]
+  toolCallsToAddToMessageHistory: (CodebuffToolCall | CustomToolCall)[]
   toolResults: ToolMessage[]
-  toolResultsToAddAfterStream: ToolMessage[]
-  skipDirectResultPush?: boolean
+  toolResultsToAddToMessageHistory: ToolMessage[]
   userId: string | undefined
   userInputId: string
 
@@ -145,8 +145,9 @@ export async function executeToolCall<T extends ToolName>(
     logger,
     previousToolCallFinished,
     toolCalls,
+    toolCallsToAddToMessageHistory,
     toolResults,
-    toolResultsToAddAfterStream: _toolResultsToAddAfterStream,
+    toolResultsToAddToMessageHistory,
     userInputId,
 
     onCostCalculated,
@@ -299,8 +300,6 @@ export async function executeToolCall<T extends ToolName>(
     includeToolCall: !excludeToolFromMessageHistory,
   })
 
-  toolCalls.push(toolCall)
-
   // Cast to any to avoid type errors
   const handler = codebuffToolHandlers[
     toolName
@@ -312,6 +311,12 @@ export async function executeToolCall<T extends ToolName>(
       ? { ...toolCall, input: effectiveInput }
       : toolCall
 
+  toolCalls.push(finalToolCall)
+  if (!excludeToolFromMessageHistory) {
+    toolCallsToAddToMessageHistory.push(finalToolCall)
+  }
+
+
   const toolResultPromise = handler({
     ...params,
     toolCall: finalToolCall,
@@ -350,8 +355,8 @@ export async function executeToolCall<T extends ToolName>(
 
     toolResults.push(toolResult)
 
-    if (!excludeToolFromMessageHistory && !params.skipDirectResultPush) {
-      agentState.messageHistory.push(toolResult)
+    if (!excludeToolFromMessageHistory) {
+      toolResultsToAddToMessageHistory.push(toolResult)
     }
 
     // After tool completes, resolve any pending creditsUsed promise
@@ -449,8 +454,9 @@ export async function executeCustomToolCall(
     requestToolCall,
     toolCallId,
     toolCalls,
+    toolCallsToAddToMessageHistory,
     toolResults,
-    toolResultsToAddAfterStream: _toolResultsToAddAfterStream,
+    toolResultsToAddToMessageHistory,
     userInputId,
   } = params
   const toolCall: CustomToolCall | ToolCallError = parseRawCustomToolCall({
@@ -513,6 +519,9 @@ export async function executeCustomToolCall(
   })
 
   toolCalls.push(toolCall)
+  if (!excludeToolFromMessageHistory) {
+    toolCallsToAddToMessageHistory.push(toolCall)
+  }
 
   return previousToolCallFinished
     .then(async () => {
@@ -534,7 +543,7 @@ export async function executeCustomToolCall(
       return clientToolResult.output satisfies ToolResultOutput[]
     })
     .then((result) => {
-      if (result === null) {
+      if (!result) {
         return
       }
       const toolResult = {
@@ -547,10 +556,6 @@ export async function executeCustomToolCall(
         { input, toolResult },
         `${toolName} custom tool call & result (${toolResult.toolCallId})`,
       )
-      if (result === undefined) {
-        return
-      }
-
       onResponseChunk({
         type: 'tool_result',
         toolName: toolResult.toolName,
@@ -560,9 +565,10 @@ export async function executeCustomToolCall(
 
       toolResults.push(toolResult)
 
-      if (!excludeToolFromMessageHistory && !params.skipDirectResultPush) {
-        agentState.messageHistory.push(toolResult)
+      if (!excludeToolFromMessageHistory) {
+        toolResultsToAddToMessageHistory.push(toolResult)
       }
+
       return
     })
 }