From f3a8262a5f7a308ef58b3a8e51a259d90c97301d Mon Sep 17 00:00:00 2001
From: Feng GAO <gaofeng21cn@hotmail.com>
Date: Tue, 28 Apr 2026 18:05:43 +0800
Subject: [PATCH] diagnostics: classify provider runner failures

---
 src/deepscientist/daemon/app.py               |  73 ++++++++-
 .../diagnostics/runner_failures.py            |  86 +++++++++++
 tests/test_daemon_api.py                      | 141 ++++++++++++++++++
 tests/test_doctor.py                          |  97 ++++++++++++
 tests/test_runner_failure_diagnostics.py      |  51 +++++++
 5 files changed, 447 insertions(+), 1 deletion(-)
 create mode 100644 tests/test_runner_failure_diagnostics.py

diff --git a/src/deepscientist/daemon/app.py b/src/deepscientist/daemon/app.py
index 7507d443..882d40f3 100644
--- a/src/deepscientist/daemon/app.py
+++ b/src/deepscientist/daemon/app.py
@@ -3402,6 +3402,19 @@ def _run_quest_turn(self, quest_id: str) -> None:
                         )
                         return
                     exhausted_summary = f"{failure_summary} Retry budget exhausted after {attempt_index} attempt(s)."
+                    diagnosis = self._runner_failure_diagnosis(
+                        runner_name=runner_name,
+                        summary=exhausted_summary,
+                        stderr_text=str(exc),
+                        output_text="",
+                    )
+                    if diagnosis is not None and diagnosis.retriable:
+                        self.quest_service.update_runtime_state(
+                            quest_root=quest_root,
+                            continuation_policy="wait_for_user_or_resume",
+                            continuation_reason=self._retry_exhausted_continuation_reason(diagnosis),
+                            continuation_updated_at=utc_now(),
+                        )
                     self._append_retry_event(
                         quest_id,
                         event_type="runner.turn_retry_exhausted",
@@ -3414,6 +3427,7 @@ def _run_quest_turn(self, quest_id: str) -> None:
                         max_attempts=max_attempts,
                         summary=exhausted_summary,
                         failure_summary=failure_summary,
+                        diagnosis=diagnosis,
                     )
                     self._record_turn_error(
                         quest_id=quest_id,
@@ -3423,6 +3437,8 @@ def _run_quest_turn(self, quest_id: str) -> None:
                         model=model,
                         summary=exhausted_summary,
                         retry_state=None,
+                        diagnosis_code=diagnosis.code if diagnosis is not None else None,
+                        guidance=list(diagnosis.guidance) if diagnosis is not None else None,
                     )
                     return
 
@@ -3576,6 +3592,19 @@ def _run_quest_turn(self, quest_id: str) -> None:
                     return
 
                 exhausted_summary = f"{failure_summary} Retry budget exhausted after {attempt_index} attempt(s)."
+                diagnosis = self._runner_failure_diagnosis(
+                    runner_name=runner_name,
+                    summary=exhausted_summary,
+                    stderr_text=result.stderr_text,
+                    output_text=result.output_text,
+                )
+                if diagnosis is not None and diagnosis.retriable:
+                    self.quest_service.update_runtime_state(
+                        quest_root=quest_root,
+                        continuation_policy="wait_for_user_or_resume",
+                        continuation_reason=self._retry_exhausted_continuation_reason(diagnosis),
+                        continuation_updated_at=utc_now(),
+                    )
                 self._append_retry_event(
                     quest_id,
                     event_type="runner.turn_retry_exhausted",
@@ -3588,6 +3617,7 @@ def _run_quest_turn(self, quest_id: str) -> None:
                     max_attempts=max_attempts,
                     summary=exhausted_summary,
                     failure_summary=failure_summary,
+                    diagnosis=diagnosis,
                 )
                 self._record_turn_error(
                     quest_id=quest_id,
@@ -3597,6 +3627,8 @@ def _run_quest_turn(self, quest_id: str) -> None:
                     model=model,
                     summary=exhausted_summary,
                     retry_state=None,
+                    diagnosis_code=diagnosis.code if diagnosis is not None else None,
+                    guidance=list(diagnosis.guidance) if diagnosis is not None else None,
                 )
                 return
             finally:
@@ -4035,6 +4067,7 @@ def _append_retry_event(
         backoff_seconds: float | None = None,
         next_attempt_index: int | None = None,
         previous_run_id: str | None = None,
+        diagnosis: FailureDiagnosis | None = None,
     ) -> dict[str, Any]:
         payload = {
             "event_id": generate_id("evt"),
@@ -4058,6 +4091,9 @@ def _append_retry_event(
             payload["next_attempt_index"] = next_attempt_index
         if previous_run_id:
             payload["previous_run_id"] = previous_run_id
+        if diagnosis is not None:
+            payload["diagnosis_code"] = diagnosis.code
+            payload["diagnosis"] = self._failure_diagnosis_payload(diagnosis)
         append_jsonl(self.home / "quests" / quest_id / ".ds" / "events.jsonl", payload)
         self.logger.log(
             "warning" if "scheduled" in event_type or "exhausted" in event_type else "info",
@@ -4071,6 +4107,7 @@ def _append_retry_event(
             backoff_seconds=backoff_seconds,
             next_attempt_index=next_attempt_index,
             previous_run_id=previous_run_id,
+            diagnosis_code=diagnosis.code if diagnosis is not None else None,
         )
         return payload
 
@@ -4262,6 +4299,34 @@ def _record_turn_error(
             ],
         )
 
+    @staticmethod
+    def _failure_diagnosis_payload(diagnosis: FailureDiagnosis) -> dict[str, Any]:
+        fix = [text for text in map(str, diagnosis.guidance) if text.strip()]  # blank guidance lines are dropped
+        return {  # plain-dict view of the diagnosis; stored under payload["diagnosis"] by _append_retry_event
+            "code": diagnosis.code,
+            "problem": diagnosis.problem,
+            "why": diagnosis.why,
+            "fix": fix,
+            "guidance": list(fix),  # same content as "fix", but a separate list so the two keys never alias
+            "retriable": bool(diagnosis.retriable),
+            "matched_text": diagnosis.matched_text,
+        }
+
+    @staticmethod
+    def _runner_failure_diagnosis(
+        *,
+        runner_name: str,
+        summary: str,
+        stderr_text: str,
+        output_text: str,
+    ) -> FailureDiagnosis | None:
+        """Delegate failure classification to ``diagnose_runner_failure``.
+
+        Every argument is forwarded unchanged by keyword.
+        """
+        evidence = {"summary": summary, "stderr_text": stderr_text, "output_text": output_text}
+        return diagnose_runner_failure(runner_name=runner_name, **evidence)
+
     @staticmethod
     def _non_retryable_failure_diagnosis(
         *,
@@ -4270,7 +4335,7 @@ def _non_retryable_failure_diagnosis(
         stderr_text: str,
         output_text: str,
     ) -> FailureDiagnosis | None:
-        diagnosis = diagnose_runner_failure(
+        diagnosis = DaemonApp._runner_failure_diagnosis(
             runner_name=runner_name,
             summary=summary,
             stderr_text=stderr_text,
@@ -4280,6 +4345,12 @@ def _non_retryable_failure_diagnosis(
             return None
         return diagnosis
 
+    @staticmethod
+    def _retry_exhausted_continuation_reason(diagnosis: FailureDiagnosis) -> str:
+        """Map an exhausted-retry diagnosis to the ``continuation_reason`` stored in runtime state."""
+        is_upstream = diagnosis.code == "codex_upstream_provider_error"
+        return "external_codex_upstream_provider_error" if is_upstream else "runner_retry_budget_exhausted"
+
     def _record_turn_postprocess_warning(
         self,
         *,
diff --git a/src/deepscientist/diagnostics/runner_failures.py b/src/deepscientist/diagnostics/runner_failures.py
index 5474c720..f2b38fde 100644
--- a/src/deepscientist/diagnostics/runner_failures.py
+++ b/src/deepscientist/diagnostics/runner_failures.py
@@ -26,6 +26,46 @@ class FailureDiagnosis:
     "unrecognized model",
 )
 
+_CODEX_PROVIDER_ACCOUNT_ERROR_MARKERS = (  # lowercase substrings; for the codex runner any hit selects the non-retriable account diagnosis
+    "account balance is negative",
+    "please recharge",
+    "insufficient quota",
+    "quota exceeded",
+    "billing hard limit",
+    "billing limit",
+    "payment required",  # NOTE(review): unguarded, so "402 payment required" below never needs the status guard
+    "invalid api key",
+    "invalid_api_key",
+    "incorrect api key",
+    "api key is invalid",
+    "unauthorized",  # NOTE(review): bare word, matches anywhere; also makes "401 unauthorized" below redundant
+)
+
+_CODEX_PROVIDER_ACCOUNT_STATUS_MARKERS = (  # honoured only when the text also contains "unexpected status", "http_code" or "status"
+    "401 unauthorized",
+    "402 payment required",
+    "403 forbidden",  # the only entry here not already covered by an unguarded account marker
+)
+
+_CODEX_UPSTREAM_ERROR_MARKERS = (  # lowercase substrings; any hit selects the retriable upstream diagnosis (checked after the account markers)
+    "rate limit",
+    "too many requests",
+    "service unavailable",
+    "bad gateway",
+    "gateway timeout",
+    "internal server error",
+    "temporarily unavailable",
+    "server overloaded",
+)
+
+_CODEX_UPSTREAM_STATUS_MARKERS = (  # same status guard as the account list; NOTE(review): every entry already contains an unguarded upstream marker
+    "429 too many requests",
+    "500 internal server error",
+    "502 bad gateway",
+    "503 service unavailable",
+    "504 gateway timeout",
+)
+
 
 def _build_haystack(*values: object) -> str:
     return "\n".join(str(value or "") for value in values if str(value or "").strip())
@@ -46,6 +86,52 @@ def diagnose_runner_failure(
     lower = haystack.lower()
     normalized_runner = str(runner_name or "").strip().lower()
 
+    if normalized_runner == "codex" and (
+        any(marker in lower for marker in _CODEX_PROVIDER_ACCOUNT_ERROR_MARKERS)
+        or (
+            ("unexpected status" in lower or "http_code" in lower or "status" in lower)
+            and any(marker in lower for marker in _CODEX_PROVIDER_ACCOUNT_STATUS_MARKERS)
+        )
+    ):
+        return FailureDiagnosis(
+            code="codex_provider_account_error",
+            problem="The configured Codex provider account cannot serve the request.",
+            why=(
+                "The provider reported an account, billing, quota, or credential blocker. "
+                "Repeating the same quest turn will keep failing until the provider account state is corrected."
+            ),
+            guidance=(
+                "Check the configured provider account, quota, billing status, credentials, and API-key scope.",
+                "Verify the same Codex profile works outside DeepScientist before resuming the quest.",
+                "Do not relaunch the same quest repeatedly until the provider account state is healthy.",
+            ),
+            retriable=False,
+            matched_text="codex provider account error",
+        )
+
+    if normalized_runner == "codex" and (
+        any(marker in lower for marker in _CODEX_UPSTREAM_ERROR_MARKERS)
+        or (
+            ("unexpected status" in lower or "http_code" in lower or "status" in lower)
+            and any(marker in lower for marker in _CODEX_UPSTREAM_STATUS_MARKERS)
+        )
+    ):
+        return FailureDiagnosis(
+            code="codex_upstream_provider_error",
+            problem="The configured Codex upstream provider rejected or could not serve the request.",
+            why=(
+                "This is an external provider/API service condition. DeepScientist can retry with backoff, "
+                "but it cannot repair upstream provider availability from inside the quest runtime."
+            ),
+            guidance=(
+                "Check the configured provider service health, rate limits, and API status.",
+                "Verify the same Codex profile works outside DeepScientist before resuming the quest.",
+                "Do not repeatedly relaunch the same quest if the provider continues returning the same upstream error.",
+            ),
+            retriable=True,
+            matched_text="codex upstream provider error",
+        )
+
     if (
         "tool call result does not follow tool call (2013)" in lower
         or "tool result's tool id" in lower
diff --git a/tests/test_daemon_api.py b/tests/test_daemon_api.py
index f23d380b..d451c7e9 100644
--- a/tests/test_daemon_api.py
+++ b/tests/test_daemon_api.py
@@ -7369,6 +7369,147 @@ def run(self, request):
     )
 
 
+def test_daemon_retry_exhaustion_records_provider_diagnosis_payload(temp_home: Path) -> None:
+    """An upstream 503 that exhausts the retry budget must carry its diagnosis on the emitted events."""
+    ensure_home_layout(temp_home)
+    ConfigManager(temp_home).ensure_files()
+    app = DaemonApp(temp_home)
+    app.runners_config["codex"].update(
+        {
+            "retry_on_failure": True,
+            "retry_max_attempts": 2,  # two attempts in total; checked via len(runner.requests) below
+            "retry_initial_backoff_sec": 0,
+            "retry_backoff_multiplier": 2,
+            "retry_max_backoff_sec": 0,
+        }
+    )
+    quest = app.quest_service.create("retry exhausted provider diagnosis quest")
+    quest_id = quest["quest_id"]
+
+    class TransientProviderFailRunner:  # stub runner: every call fails with the same upstream 503 text
+        binary = ""
+
+        def __init__(self) -> None:
+            self.requests = []
+
+        def run(self, request):
+            self.requests.append(request)
+            history_root = ensure_dir(request.quest_root / ".ds" / "codex_history" / request.run_id)
+            run_root = ensure_dir(request.quest_root / ".ds" / "runs" / request.run_id)
+            return RunResult(
+                ok=False,
+                run_id=request.run_id,
+                model=request.model,
+                output_text="unexpected status 503 Service Unavailable from upstream provider",
+                exit_code=1,
+                history_root=history_root,
+                run_root=run_root,
+                stderr_text="",
+            )
+
+    runner = app.runners["codex"] = TransientProviderFailRunner()
+
+    payload = app.handlers.chat(quest_id, {"text": "Please continue.", "source": "tui-ink"})
+    assert payload["ok"] is True
+
+    deadline = time.monotonic() + 5  # monotonic clock: the timeout cannot be skewed by wall-clock adjustments
+    while time.monotonic() < deadline:
+        snapshot = app.quest_service.snapshot(quest_id)
+        events = read_jsonl(Path(quest["quest_root"]) / ".ds" / "events.jsonl")
+        if (
+            any(item.get("type") == "runner.turn_error" for item in events)
+            and snapshot.get("retry_state") is None
+            and str(snapshot.get("display_status") or "").strip() == "error"
+        ):
+            break
+        time.sleep(0.05)
+    else:  # while/else: reached only when the deadline passes without a break
+        raise AssertionError("provider failure did not settle after retry budget exhaustion")
+
+    snapshot = app.quest_service.snapshot(quest_id)
+    events = read_jsonl(Path(quest["quest_root"]) / ".ds" / "events.jsonl")
+    retry_exhausted = [item for item in events if item.get("type") == "runner.turn_retry_exhausted"]
+    turn_errors = [item for item in events if item.get("type") == "runner.turn_error"]
+
+    assert len(runner.requests) == 2
+    assert snapshot["continuation_policy"] == "wait_for_user_or_resume"
+    assert snapshot["continuation_reason"] == "external_codex_upstream_provider_error"
+    assert retry_exhausted
+    diagnosis = retry_exhausted[-1].get("diagnosis")
+    assert diagnosis["code"] == "codex_upstream_provider_error"
+    assert diagnosis["problem"]
+    assert diagnosis["why"]
+    assert any("provider" in line.lower() for line in diagnosis["fix"])
+    assert retry_exhausted[-1]["diagnosis_code"] == "codex_upstream_provider_error"
+    assert turn_errors[-1]["diagnosis_code"] == "codex_upstream_provider_error"
+
+
+def test_daemon_stops_retry_for_provider_account_blocker(temp_home: Path) -> None:
+    """A provider account blocker must end the turn after one attempt, with no retry scheduled."""
+    ensure_home_layout(temp_home)
+    ConfigManager(temp_home).ensure_files()
+    app = DaemonApp(temp_home)
+    app.runners_config["codex"].update(
+        {
+            "retry_on_failure": True,
+            "retry_max_attempts": 5,  # generous budget: the test asserts it is NOT consumed
+            "retry_initial_backoff_sec": 0,
+            "retry_backoff_multiplier": 2,
+            "retry_max_backoff_sec": 0,
+        }
+    )
+    quest = app.quest_service.create("provider account blocker quest")
+    quest_id = quest["quest_id"]
+
+    class AccountBlockerRunner:  # stub runner: every call fails with a 403 / negative-balance message
+        binary = ""
+
+        def __init__(self) -> None:
+            self.requests = []
+
+        def run(self, request):
+            self.requests.append(request)
+            history_root = ensure_dir(request.quest_root / ".ds" / "codex_history" / request.run_id)
+            run_root = ensure_dir(request.quest_root / ".ds" / "runs" / request.run_id)
+            return RunResult(
+                ok=False,
+                run_id=request.run_id,
+                model=request.model,
+                output_text="unexpected status 403 Forbidden: account balance is negative, please recharge first",
+                exit_code=1,
+                history_root=history_root,
+                run_root=run_root,
+                stderr_text="",
+            )
+
+    runner = app.runners["codex"] = AccountBlockerRunner()
+
+    payload = app.handlers.chat(quest_id, {"text": "Please continue.", "source": "tui-ink"})
+    assert payload["ok"] is True
+
+    deadline = time.monotonic() + 5  # monotonic clock: the timeout cannot be skewed by wall-clock adjustments
+    while time.monotonic() < deadline:
+        snapshot = app.quest_service.snapshot(quest_id)
+        events = read_jsonl(Path(quest["quest_root"]) / ".ds" / "events.jsonl")
+        if any(item.get("type") == "runner.turn_error" for item in events):
+            if snapshot.get("retry_state") is None and str(snapshot.get("display_status") or "").strip() == "error":
+                break
+        time.sleep(0.05)
+    else:  # while/else: reached only when the deadline passes without a break
+        raise AssertionError("provider account blocker did not settle into an immediate error state")
+
+    snapshot = app.quest_service.snapshot(quest_id)
+    events = read_jsonl(Path(quest["quest_root"]) / ".ds" / "events.jsonl")
+    turn_errors = [item for item in events if item.get("type") == "runner.turn_error"]
+
+    assert len(runner.requests) == 1
+    assert snapshot["retry_state"] is None
+    assert snapshot["continuation_policy"] == "wait_for_user_or_resume"
+    assert snapshot["continuation_reason"] == "non_retryable_runner_error"
+    assert not any(item.get("type") == "runner.turn_retry_scheduled" for item in events)
+    assert turn_errors[-1]["diagnosis_code"] == "codex_provider_account_error"
+
+
 def test_daemon_skips_retry_for_non_retryable_minimax_protocol_error(temp_home: Path) -> None:
     ensure_home_layout(temp_home)
     ConfigManager(temp_home).ensure_files()
diff --git a/tests/test_doctor.py b/tests/test_doctor.py
index 0f577c37..dd31c536 100644
--- a/tests/test_doctor.py
+++ b/tests/test_doctor.py
@@ -13,6 +13,55 @@
 from deepscientist.shared import append_jsonl, ensure_dir, utc_now, write_json
 
 
+def _stub_ready_doctor_environment(monkeypatch) -> None:  # type: ignore[no-untyped-def]
+    monkeypatch.setattr("deepscientist.doctor.resolve_runner_binary", lambda binary, runner_name=None: "/usr/bin/codex")  # every runner binary resolves
+    monkeypatch.setattr("deepscientist.doctor._query_local_health", lambda url: None)  # health query always yields None
+    monkeypatch.setattr("deepscientist.doctor._port_is_bindable", lambda host, port: (True, None))  # port always reported bindable
+    monkeypatch.setattr(  # bundle check replaced by a canned "ok" result
+        "deepscientist.doctor._check_bundles",
+        lambda root: {
+            "id": "bundles",
+            "label": "UI bundles",
+            "ok": True,
+            "status": "ok",
+            "summary": "Web and TUI bundles are present.",
+            "warnings": [],
+            "errors": [],
+            "guidance": [],
+            "details": {},
+        },
+    )
+    monkeypatch.setattr("deepscientist.doctor.which", lambda name: "/usr/bin/uv" if name == "uv" else None)  # only "uv" is found
+    monkeypatch.setattr(  # NOTE(review): this target looks like the shared subprocess module, so run() would be stubbed process-wide until teardown - confirm
+        "deepscientist.doctor.subprocess.run",
+        lambda *args, **kwargs: SimpleNamespace(returncode=0, stdout="uv 0.9.2\n", stderr=""),
+    )
+    monkeypatch.setattr(  # git readiness is reported healthy with a configured identity
+        ConfigManager,
+        "git_readiness",
+        lambda self: {
+            "ok": True,
+            "installed": True,
+            "user_name": "Deep Scientist",
+            "user_email": "deep@example.com",
+            "warnings": [],
+            "errors": [],
+            "guidance": [],
+        },
+    )
+    monkeypatch.setattr(  # the Codex startup probe is replaced by a canned success result
+        ConfigManager,
+        "probe_codex_bootstrap",
+        lambda self, *, persist=False, payload=None: {
+            "ok": True,
+            "summary": "Codex startup probe completed.",
+            "warnings": [],
+            "errors": [],
+            "guidance": [],
+        },
+    )
+
+
 def test_cli_parser_exposes_doctor_and_removes_metrics() -> None:
     parser = build_parser()
 
@@ -270,6 +319,54 @@ def test_doctor_reports_recent_runtime_failure_with_problem_why_fix(monkeypatch,
     assert f"evidence: quest: {quest['quest_id']}" in rendered
 
 
+def test_doctor_reports_provider_account_runtime_failure_with_problem_why_fix(monkeypatch, temp_home: Path) -> None:
+    ensure_home_layout(temp_home)
+    manager = ConfigManager(temp_home)
+    manager.ensure_files()
+    quest = QuestService(temp_home).create("doctor provider account diagnosis quest")
+    quest_root = Path(quest["quest_root"])
+    run_id = "run-provider-account-001"
+    run_root = ensure_dir(quest_root / ".ds" / "runs" / run_id)
+    write_json(  # failed run result whose output_text carries the 402 / insufficient-quota wording
+        run_root / "result.json",
+        {
+            "ok": False,
+            "run_id": run_id,
+            "model": "provider-default",
+            "exit_code": 1,
+            "output_text": "unexpected status 402 Payment Required: insufficient quota",
+            "stderr_text": "",
+            "completed_at": utc_now(),
+        },
+    )
+    append_jsonl(  # matching runner.turn_error event that points at the run above
+        quest_root / ".ds" / "events.jsonl",
+        {
+            "event_id": "evt-provider-account-001",
+            "type": "runner.turn_error",
+            "quest_id": quest["quest_id"],
+            "run_id": run_id,
+            "source": "codex",
+            "skill_id": "baseline",
+            "model": "provider-default",
+            "summary": "Runner failed after provider rejected the account quota.",  # contains none of the account markers; presumably the doctor classifies from result.json - TODO confirm
+            "created_at": utc_now(),
+        },
+    )
+    _stub_ready_doctor_environment(monkeypatch)  # replace the environment probes with canned healthy results
+
+    report = run_doctor(temp_home, repo_root=repo_root())
+    rendered = render_doctor_report(report)
+    runtime_check = next(item for item in report["checks"] if item["id"] == "recent_runtime_failures")  # StopIteration here means the check is missing
+
+    assert runtime_check["status"] == "warn"
+    assert runtime_check["problem"] == "The configured Codex provider account cannot serve the request."
+    assert "account, billing, quota" in str(runtime_check["why"])
+    assert any("quota" in line.lower() for line in runtime_check["fix"])
+    assert "problem: The configured Codex provider account cannot serve the request." in rendered  # rendered text must expose the same diagnosis
+    assert "fix: Check the configured provider account" in rendered
+
+
 def test_doctor_surfaces_probe_diagnosis_for_known_tool_argument_error(monkeypatch, temp_home: Path) -> None:
     ensure_home_layout(temp_home)
     manager = ConfigManager(temp_home)
diff --git a/tests/test_runner_failure_diagnostics.py b/tests/test_runner_failure_diagnostics.py
new file mode 100644
index 00000000..5712dd86
--- /dev/null
+++ b/tests/test_runner_failure_diagnostics.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+import pytest
+
+from deepscientist.diagnostics import diagnose_runner_failure
+
+
+@pytest.mark.parametrize(
+    "message",
+    [
+        "unexpected status 403 Forbidden: account balance is negative, please recharge first",
+        "unexpected status 402 Payment Required: insufficient quota",
+        '{"error":{"message":"billing hard limit has been reached","http_code":"403"}}',
+        "401 Unauthorized: invalid api key for the configured provider account",
+    ],
+)
+def test_codex_provider_account_errors_are_non_retryable_blockers(message: str) -> None:
+    """Account, billing, quota and credential failures classify as non-retriable account errors."""
+    verdict = diagnose_runner_failure(runner_name="codex", output_text=message)
+    assert verdict is not None
+    assert verdict.code == "codex_provider_account_error"
+    assert verdict.retriable is False
+    assert "account" in verdict.problem.lower()
+
+
+@pytest.mark.parametrize(
+    "message",
+    [
+        "unexpected status 429 Too Many Requests: rate limit exceeded",
+        "unexpected status 503 Service Unavailable",
+        '{"error":{"message":"502 Bad Gateway from upstream provider","http_code":"502"}}',
+        "504 gateway timeout from upstream model provider",
+    ],
+)
+def test_codex_upstream_provider_errors_are_retryable_external_blockers(message: str) -> None:
+    """Rate-limit and gateway/availability failures classify as retriable upstream errors."""
+    verdict = diagnose_runner_failure(runner_name="codex", output_text=message)
+    assert verdict is not None
+    assert verdict.code == "codex_upstream_provider_error"
+    assert verdict.retriable is True
+
+
+def test_codex_bad_request_protocol_errors_stay_non_retryable_local_diagnostics() -> None:
+    """The provider checks must not shadow the existing tool-result-sequence diagnosis."""
+    # http_code 400 bad_request_error payload: expected to keep its protocol-level diagnosis code.
+    raw_error = '{"type":"error","error":{"type":"bad_request_error","message":"invalid params, tool call result does not follow tool call (2013)","http_code":"400"}}'
+    verdict = diagnose_runner_failure(runner_name="codex", stderr_text=raw_error)
+
+    assert verdict is not None
+    assert verdict.code == "minimax_tool_result_sequence_error"
+    assert verdict.retriable is False