ResearAI · gaofeng21cn · Apr 28, 2026
diff --git a/src/deepscientist/daemon/app.py b/src/deepscientist/daemon/app.py
@@ -3402,6 +3402,19 @@ def _run_quest_turn(self, quest_id: str) -> None:
                         )
                         return
                     exhausted_summary = f"{failure_summary} Retry budget exhausted after {attempt_index} attempt(s)."
+                    diagnosis = self._runner_failure_diagnosis(
+                        runner_name=runner_name,
+                        summary=exhausted_summary,
+                        stderr_text=str(exc),
+                        output_text="",
+                    )
+                    if diagnosis is not None and diagnosis.retriable:
+                        self.quest_service.update_runtime_state(
+                            quest_root=quest_root,
+                            continuation_policy="wait_for_user_or_resume",
+                            continuation_reason=self._retry_exhausted_continuation_reason(diagnosis),
+                            continuation_updated_at=utc_now(),
+                        )
                     self._append_retry_event(
                         quest_id,
                         event_type="runner.turn_retry_exhausted",
@@ -3414,6 +3427,7 @@ def _run_quest_turn(self, quest_id: str) -> None:
                         max_attempts=max_attempts,
                         summary=exhausted_summary,
                         failure_summary=failure_summary,
+                        diagnosis=diagnosis,
                     )
                     self._record_turn_error(
                         quest_id=quest_id,
@@ -3423,6 +3437,8 @@ def _run_quest_turn(self, quest_id: str) -> None:
                         model=model,
                         summary=exhausted_summary,
                         retry_state=None,
+                        diagnosis_code=diagnosis.code if diagnosis is not None else None,
+                        guidance=list(diagnosis.guidance) if diagnosis is not None else None,
                     )
                     return
 
@@ -3576,6 +3592,19 @@ def _run_quest_turn(self, quest_id: str) -> None:
                     return
 
                 exhausted_summary = f"{failure_summary} Retry budget exhausted after {attempt_index} attempt(s)."
+                diagnosis = self._runner_failure_diagnosis(
+                    runner_name=runner_name,
+                    summary=exhausted_summary,
+                    stderr_text=result.stderr_text,
+                    output_text=result.output_text,
+                )
+                if diagnosis is not None and diagnosis.retriable:
+                    self.quest_service.update_runtime_state(
+                        quest_root=quest_root,
+                        continuation_policy="wait_for_user_or_resume",
+                        continuation_reason=self._retry_exhausted_continuation_reason(diagnosis),
+                        continuation_updated_at=utc_now(),
+                    )
                 self._append_retry_event(
                     quest_id,
                     event_type="runner.turn_retry_exhausted",
@@ -3588,6 +3617,7 @@ def _run_quest_turn(self, quest_id: str) -> None:
                     max_attempts=max_attempts,
                     summary=exhausted_summary,
                     failure_summary=failure_summary,
+                    diagnosis=diagnosis,
                 )
                 self._record_turn_error(
                     quest_id=quest_id,
@@ -3597,6 +3627,8 @@ def _run_quest_turn(self, quest_id: str) -> None:
                     model=model,
                     summary=exhausted_summary,
                     retry_state=None,
+                    diagnosis_code=diagnosis.code if diagnosis is not None else None,
+                    guidance=list(diagnosis.guidance) if diagnosis is not None else None,
                 )
                 return
             finally:
@@ -4035,6 +4067,7 @@ def _append_retry_event(
         backoff_seconds: float | None = None,
         next_attempt_index: int | None = None,
         previous_run_id: str | None = None,
+        diagnosis: FailureDiagnosis | None = None,
     ) -> dict[str, Any]:
         payload = {
             "event_id": generate_id("evt"),
@@ -4058,6 +4091,9 @@ def _append_retry_event(
             payload["next_attempt_index"] = next_attempt_index
         if previous_run_id:
             payload["previous_run_id"] = previous_run_id
+        if diagnosis is not None:
+            payload["diagnosis_code"] = diagnosis.code
+            payload["diagnosis"] = self._failure_diagnosis_payload(diagnosis)
         append_jsonl(self.home / "quests" / quest_id / ".ds" / "events.jsonl", payload)
         self.logger.log(
             "warning" if "scheduled" in event_type or "exhausted" in event_type else "info",
@@ -4071,6 +4107,7 @@ def _append_retry_event(
             backoff_seconds=backoff_seconds,
             next_attempt_index=next_attempt_index,
             previous_run_id=previous_run_id,
+            diagnosis_code=diagnosis.code if diagnosis is not None else None,
         )
         return payload
 
@@ -4262,6 +4299,34 @@ def _record_turn_error(
             ],
         )
 
+    @staticmethod
+    def _failure_diagnosis_payload(diagnosis: FailureDiagnosis) -> dict[str, Any]:
+        """Flatten *diagnosis* into a plain dict, dropping blank guidance lines."""
+        steps = [text for text in map(str, diagnosis.guidance) if text.strip()]
+        payload: dict[str, Any] = {"code": diagnosis.code}
+        payload["problem"] = diagnosis.problem
+        payload["why"] = diagnosis.why
+        payload["fix"] = steps  # "fix" and "guidance" carry the same list
+        payload["guidance"] = steps
+        payload["retriable"] = bool(diagnosis.retriable)
+        payload["matched_text"] = diagnosis.matched_text
+        return payload
+
+    @staticmethod
+    def _runner_failure_diagnosis(
+        *,
+        runner_name: str,
+        summary: str,
+        stderr_text: str,
+        output_text: str,
+    ) -> FailureDiagnosis | None:
+        """Forward the failure text to ``diagnose_runner_failure`` unchanged.
+
+        Returns whatever that helper returns (a diagnosis or ``None``).
+        """
+        forwarded = dict(runner_name=runner_name, summary=summary, stderr_text=stderr_text, output_text=output_text)
+        return diagnose_runner_failure(**forwarded)
+
     @staticmethod
     def _non_retryable_failure_diagnosis(
         *,
@@ -4270,7 +4335,7 @@ def _non_retryable_failure_diagnosis(
         stderr_text: str,
         output_text: str,
     ) -> FailureDiagnosis | None:
-        diagnosis = diagnose_runner_failure(
+        diagnosis = DaemonApp._runner_failure_diagnosis(
             runner_name=runner_name,
             summary=summary,
             stderr_text=stderr_text,
@@ -4280,6 +4345,12 @@ def _non_retryable_failure_diagnosis(
             return None
         return diagnosis
 
+    @staticmethod
+    def _retry_exhausted_continuation_reason(diagnosis: FailureDiagnosis) -> str:
+        """Map the diagnosis code to the continuation reason string."""
+        is_upstream = diagnosis.code == "codex_upstream_provider_error"
+        return "external_codex_upstream_provider_error" if is_upstream else "runner_retry_budget_exhausted"
+
     def _record_turn_postprocess_warning(
         self,
         *,

diff --git a/src/deepscientist/diagnostics/runner_failures.py b/src/deepscientist/diagnostics/runner_failures.py
@@ -26,6 +26,46 @@ class FailureDiagnosis:
     "unrecognized model",
 )
 
+_CODEX_PROVIDER_ACCOUNT_ERROR_MARKERS = (  # lowercase substrings; any hit (codex runner) -> account error
+    "account balance is negative",
+    "please recharge",
+    "insufficient quota",
+    "quota exceeded",
+    "billing hard limit",
+    "billing limit",
+    "payment required",
+    "invalid api key",
+    "invalid_api_key",
+    "incorrect api key",
+    "api key is invalid",
+    "unauthorized",  # substring match, so this also hits any text containing "401 unauthorized"
+)
+
+_CODEX_PROVIDER_ACCOUNT_STATUS_MARKERS = (  # only consulted when the text also mentions "status" / "http_code"
+    "401 unauthorized",
+    "402 payment required",
+    "403 forbidden",
+)
+
+_CODEX_UPSTREAM_ERROR_MARKERS = (  # lowercase substrings; checked after the account markers -> upstream error
+    "rate limit",
+    "too many requests",
+    "service unavailable",
+    "bad gateway",
+    "gateway timeout",
+    "internal server error",
+    "temporarily unavailable",
+    "server overloaded",
+)
+
+_CODEX_UPSTREAM_STATUS_MARKERS = (  # only consulted when the text also mentions "status" / "http_code"
+    "429 too many requests",
+    "500 internal server error",
+    "502 bad gateway",
+    "503 service unavailable",
+    "504 gateway timeout",
+)
+
 
 def _build_haystack(*values: object) -> str:
     return "\n".join(str(value or "") for value in values if str(value or "").strip())
@@ -46,6 +86,52 @@ def diagnose_runner_failure(
     lower = haystack.lower()
     normalized_runner = str(runner_name or "").strip().lower()
 
+    if normalized_runner == "codex" and (
+        any(marker in lower for marker in _CODEX_PROVIDER_ACCOUNT_ERROR_MARKERS)
+        or (
+            ("unexpected status" in lower or "http_code" in lower or "status" in lower)
+            and any(marker in lower for marker in _CODEX_PROVIDER_ACCOUNT_STATUS_MARKERS)
+        )
+    ):
+        return FailureDiagnosis(
+            code="codex_provider_account_error",
+            problem="The configured Codex provider account cannot serve the request.",
+            why=(
+                "The provider reported an account, billing, quota, or credential blocker. "
+                "Repeating the same quest turn will keep failing until the provider account state is corrected."
+            ),
+            guidance=(
+                "Check the configured provider account, quota, billing status, credentials, and API-key scope.",
+                "Verify the same Codex profile works outside DeepScientist before resuming the quest.",
+                "Do not relaunch the same quest repeatedly until the provider account state is healthy.",
+            ),
+            retriable=False,
+            matched_text="codex provider account error",
+        )
+
+    if normalized_runner == "codex" and (
+        any(marker in lower for marker in _CODEX_UPSTREAM_ERROR_MARKERS)
+        or (
+            ("unexpected status" in lower or "http_code" in lower or "status" in lower)
+            and any(marker in lower for marker in _CODEX_UPSTREAM_STATUS_MARKERS)
+        )
+    ):
+        return FailureDiagnosis(
+            code="codex_upstream_provider_error",
+            problem="The configured Codex upstream provider rejected or could not serve the request.",
+            why=(
+                "This is an external provider/API service condition. DeepScientist can retry with backoff, "
+                "but it cannot repair upstream provider availability from inside the quest runtime."
+            ),
+            guidance=(
+                "Check the configured provider service health, rate limits, and API status.",
+                "Verify the same Codex profile works outside DeepScientist before resuming the quest.",
+                "Do not repeatedly relaunch the same quest if the provider continues returning the same upstream error.",
+            ),
+            retriable=True,
+            matched_text="codex upstream provider error",
+        )
+
     if (
         "tool call result does not follow tool call (2013)" in lower
         or "tool result's tool id" in lower