From f3a8262a5f7a308ef58b3a8e51a259d90c97301d Mon Sep 17 00:00:00 2001 From: Feng GAO Date: Tue, 28 Apr 2026 18:05:43 +0800 Subject: [PATCH] diagnostics: classify provider runner failures --- src/deepscientist/daemon/app.py | 73 ++++++++- .../diagnostics/runner_failures.py | 86 +++++++++++ tests/test_daemon_api.py | 141 ++++++++++++++++++ tests/test_doctor.py | 97 ++++++++++++ tests/test_runner_failure_diagnostics.py | 51 +++++++ 5 files changed, 447 insertions(+), 1 deletion(-) create mode 100644 tests/test_runner_failure_diagnostics.py diff --git a/src/deepscientist/daemon/app.py b/src/deepscientist/daemon/app.py index 7507d443..882d40f3 100644 --- a/src/deepscientist/daemon/app.py +++ b/src/deepscientist/daemon/app.py @@ -3402,6 +3402,19 @@ def _run_quest_turn(self, quest_id: str) -> None: ) return exhausted_summary = f"{failure_summary} Retry budget exhausted after {attempt_index} attempt(s)." + diagnosis = self._runner_failure_diagnosis( + runner_name=runner_name, + summary=exhausted_summary, + stderr_text=str(exc), + output_text="", + ) + if diagnosis is not None and diagnosis.retriable: + self.quest_service.update_runtime_state( + quest_root=quest_root, + continuation_policy="wait_for_user_or_resume", + continuation_reason=self._retry_exhausted_continuation_reason(diagnosis), + continuation_updated_at=utc_now(), + ) self._append_retry_event( quest_id, event_type="runner.turn_retry_exhausted", @@ -3414,6 +3427,7 @@ def _run_quest_turn(self, quest_id: str) -> None: max_attempts=max_attempts, summary=exhausted_summary, failure_summary=failure_summary, + diagnosis=diagnosis, ) self._record_turn_error( quest_id=quest_id, @@ -3423,6 +3437,8 @@ def _run_quest_turn(self, quest_id: str) -> None: model=model, summary=exhausted_summary, retry_state=None, + diagnosis_code=diagnosis.code if diagnosis is not None else None, + guidance=list(diagnosis.guidance) if diagnosis is not None else None, ) return @@ -3576,6 +3592,19 @@ def _run_quest_turn(self, quest_id: str) -> None: return exhausted_summary = f"{failure_summary} Retry budget exhausted after {attempt_index} attempt(s)." + diagnosis = self._runner_failure_diagnosis( + runner_name=runner_name, + summary=exhausted_summary, + stderr_text=result.stderr_text, + output_text=result.output_text, + ) + if diagnosis is not None and diagnosis.retriable: + self.quest_service.update_runtime_state( + quest_root=quest_root, + continuation_policy="wait_for_user_or_resume", + continuation_reason=self._retry_exhausted_continuation_reason(diagnosis), + continuation_updated_at=utc_now(), + ) self._append_retry_event( quest_id, event_type="runner.turn_retry_exhausted", @@ -3588,6 +3617,7 @@ def _run_quest_turn(self, quest_id: str) -> None: max_attempts=max_attempts, summary=exhausted_summary, failure_summary=failure_summary, + diagnosis=diagnosis, ) self._record_turn_error( quest_id=quest_id, @@ -3597,6 +3627,8 @@ def _run_quest_turn(self, quest_id: str) -> None: model=model, summary=exhausted_summary, retry_state=None, + diagnosis_code=diagnosis.code if diagnosis is not None else None, + guidance=list(diagnosis.guidance) if diagnosis is not None else None, ) return finally: @@ -4035,6 +4067,7 @@ def _append_retry_event( backoff_seconds: float | None = None, next_attempt_index: int | None = None, previous_run_id: str | None = None, + diagnosis: FailureDiagnosis | None = None, ) -> dict[str, Any]: payload = { "event_id": generate_id("evt"), @@ -4058,6 +4091,9 @@ def _append_retry_event( payload["next_attempt_index"] = next_attempt_index if previous_run_id: payload["previous_run_id"] = previous_run_id + if diagnosis is not None: + payload["diagnosis_code"] = diagnosis.code + payload["diagnosis"] = self._failure_diagnosis_payload(diagnosis) append_jsonl(self.home / "quests" / quest_id / ".ds" / "events.jsonl", payload) self.logger.log( "warning" if "scheduled" in event_type or "exhausted" in event_type else "info", @@ -4071,6 +4107,7 @@ def _append_retry_event( backoff_seconds=backoff_seconds, next_attempt_index=next_attempt_index, previous_run_id=previous_run_id, + diagnosis_code=diagnosis.code if diagnosis is not None else None, ) return payload @@ -4262,6 +4299,34 @@ def _record_turn_error( ], ) + @staticmethod + def _failure_diagnosis_payload(diagnosis: FailureDiagnosis) -> dict[str, Any]: + fix = [str(line) for line in diagnosis.guidance if str(line).strip()] + return { + "code": diagnosis.code, + "problem": diagnosis.problem, + "why": diagnosis.why, + "fix": fix, + "guidance": fix, + "retriable": bool(diagnosis.retriable), + "matched_text": diagnosis.matched_text, + } + + @staticmethod + def _runner_failure_diagnosis( + *, + runner_name: str, + summary: str, + stderr_text: str, + output_text: str, + ) -> FailureDiagnosis | None: + return diagnose_runner_failure( + runner_name=runner_name, + summary=summary, + stderr_text=stderr_text, + output_text=output_text, + ) + @staticmethod def _non_retryable_failure_diagnosis( *, @@ -4270,7 +4335,7 @@ def _non_retryable_failure_diagnosis( stderr_text: str, output_text: str, ) -> FailureDiagnosis | None: - diagnosis = diagnose_runner_failure( + diagnosis = DaemonApp._runner_failure_diagnosis( runner_name=runner_name, summary=summary, stderr_text=stderr_text, @@ -4280,6 +4345,12 @@ def _non_retryable_failure_diagnosis( return None return diagnosis + @staticmethod + def _retry_exhausted_continuation_reason(diagnosis: FailureDiagnosis) -> str: + if diagnosis.code == "codex_upstream_provider_error": + return "external_codex_upstream_provider_error" + return "runner_retry_budget_exhausted" + def _record_turn_postprocess_warning( self, *, diff --git a/src/deepscientist/diagnostics/runner_failures.py b/src/deepscientist/diagnostics/runner_failures.py index 5474c720..f2b38fde 100644 --- a/src/deepscientist/diagnostics/runner_failures.py +++ b/src/deepscientist/diagnostics/runner_failures.py @@ -26,6 +26,46 @@ class FailureDiagnosis: "unrecognized model", ) +_CODEX_PROVIDER_ACCOUNT_ERROR_MARKERS = ( + "account balance is negative", + "please recharge", + "insufficient quota", + "quota exceeded", + "billing hard limit", + "billing limit", + "payment required", + "invalid api key", + "invalid_api_key", + "incorrect api key", + "api key is invalid", + "unauthorized", +) + +_CODEX_PROVIDER_ACCOUNT_STATUS_MARKERS = ( + "401 unauthorized", + "402 payment required", + "403 forbidden", +) + +_CODEX_UPSTREAM_ERROR_MARKERS = ( + "rate limit", + "too many requests", + "service unavailable", + "bad gateway", + "gateway timeout", + "internal server error", + "temporarily unavailable", + "server overloaded", +) + +_CODEX_UPSTREAM_STATUS_MARKERS = ( + "429 too many requests", + "500 internal server error", + "502 bad gateway", + "503 service unavailable", + "504 gateway timeout", +) + def _build_haystack(*values: object) -> str: return "\n".join(str(value or "") for value in values if str(value or "").strip()) @@ -46,6 +86,52 @@ def diagnose_runner_failure( lower = haystack.lower() normalized_runner = str(runner_name or "").strip().lower() + if normalized_runner == "codex" and ( + any(marker in lower for marker in _CODEX_PROVIDER_ACCOUNT_ERROR_MARKERS) + or ( + ("unexpected status" in lower or "http_code" in lower or "status" in lower) + and any(marker in lower for marker in _CODEX_PROVIDER_ACCOUNT_STATUS_MARKERS) + ) + ): + return FailureDiagnosis( + code="codex_provider_account_error", + problem="The configured Codex provider account cannot serve the request.", + why=( + "The provider reported an account, billing, quota, or credential blocker. " + "Repeating the same quest turn will keep failing until the provider account state is corrected." + ), + guidance=( + "Check the configured provider account, quota, billing status, credentials, and API-key scope.", + "Verify the same Codex profile works outside DeepScientist before resuming the quest.", + "Do not relaunch the same quest repeatedly until the provider account state is healthy.", + ), + retriable=False, + matched_text="codex provider account error", + ) + + if normalized_runner == "codex" and ( + any(marker in lower for marker in _CODEX_UPSTREAM_ERROR_MARKERS) + or ( + ("unexpected status" in lower or "http_code" in lower or "status" in lower) + and any(marker in lower for marker in _CODEX_UPSTREAM_STATUS_MARKERS) + ) + ): + return FailureDiagnosis( + code="codex_upstream_provider_error", + problem="The configured Codex upstream provider rejected or could not serve the request.", + why=( + "This is an external provider/API service condition. DeepScientist can retry with backoff, " + "but it cannot repair upstream provider availability from inside the quest runtime." + ), + guidance=( + "Check the configured provider service health, rate limits, and API status.", + "Verify the same Codex profile works outside DeepScientist before resuming the quest.", + "Do not repeatedly relaunch the same quest if the provider continues returning the same upstream error.", + ), + retriable=True, + matched_text="codex upstream provider error", + ) + if ( "tool call result does not follow tool call (2013)" in lower or "tool result's tool id" in lower diff --git a/tests/test_daemon_api.py b/tests/test_daemon_api.py index f23d380b..d451c7e9 100644 --- a/tests/test_daemon_api.py +++ b/tests/test_daemon_api.py @@ -7369,6 +7369,147 @@ def run(self, request): ) +def test_daemon_retry_exhaustion_records_provider_diagnosis_payload(temp_home: Path) -> None: + ensure_home_layout(temp_home) + ConfigManager(temp_home).ensure_files() + app = DaemonApp(temp_home) + app.runners_config["codex"].update( + { + "retry_on_failure": True, + "retry_max_attempts": 2, + "retry_initial_backoff_sec": 0, + "retry_backoff_multiplier": 2, + "retry_max_backoff_sec": 0, + } + ) + quest = app.quest_service.create("retry exhausted provider diagnosis quest") + quest_id = quest["quest_id"] + + class TransientProviderFailRunner: + binary = "" + + def __init__(self) -> None: + self.requests = [] + + def run(self, request): + self.requests.append(request) + history_root = ensure_dir(request.quest_root / ".ds" / "codex_history" / request.run_id) + run_root = ensure_dir(request.quest_root / ".ds" / "runs" / request.run_id) + return RunResult( + ok=False, + run_id=request.run_id, + model=request.model, + output_text="unexpected status 503 Service Unavailable from upstream provider", + exit_code=1, + history_root=history_root, + run_root=run_root, + stderr_text="", + ) + + runner = TransientProviderFailRunner() + app.runners["codex"] = runner + + payload = app.handlers.chat(quest_id, {"text": "Please continue.", "source": "tui-ink"}) + assert payload["ok"] is True + + deadline = time.time() + 5 + while time.time() < deadline: + snapshot = app.quest_service.snapshot(quest_id) + events = read_jsonl(Path(quest["quest_root"]) / ".ds" / "events.jsonl") + if ( + any(item.get("type") == "runner.turn_error" for item in events) + and snapshot.get("retry_state") is None + and str(snapshot.get("display_status") or "").strip() == "error" + ): + break + time.sleep(0.05) + else: + raise AssertionError("provider failure did not settle after retry budget exhaustion") + + snapshot = app.quest_service.snapshot(quest_id) + events = read_jsonl(Path(quest["quest_root"]) / ".ds" / "events.jsonl") + retry_exhausted = [item for item in events if item.get("type") == "runner.turn_retry_exhausted"] + turn_errors = [item for item in events if item.get("type") == "runner.turn_error"] + + assert len(runner.requests) == 2 + assert snapshot["continuation_policy"] == "wait_for_user_or_resume" + assert snapshot["continuation_reason"] == "external_codex_upstream_provider_error" + assert retry_exhausted + diagnosis = retry_exhausted[-1].get("diagnosis") + assert diagnosis["code"] == "codex_upstream_provider_error" + assert diagnosis["problem"] + assert diagnosis["why"] + assert any("provider" in line.lower() for line in diagnosis["fix"]) + assert retry_exhausted[-1]["diagnosis_code"] == "codex_upstream_provider_error" + assert turn_errors[-1]["diagnosis_code"] == "codex_upstream_provider_error" + + +def test_daemon_stops_retry_for_provider_account_blocker(temp_home: Path) -> None: + ensure_home_layout(temp_home) + ConfigManager(temp_home).ensure_files() + app = DaemonApp(temp_home) + app.runners_config["codex"].update( + { + "retry_on_failure": True, + "retry_max_attempts": 5, + "retry_initial_backoff_sec": 0, + "retry_backoff_multiplier": 2, + "retry_max_backoff_sec": 0, + } + ) + quest = app.quest_service.create("provider account blocker quest") + quest_id = quest["quest_id"] + + class AccountBlockerRunner: + binary = "" + + def __init__(self) -> None: + self.requests = [] + + def run(self, request): + self.requests.append(request) + history_root = ensure_dir(request.quest_root / ".ds" / "codex_history" / request.run_id) + run_root = ensure_dir(request.quest_root / ".ds" / "runs" / request.run_id) + return RunResult( + ok=False, + run_id=request.run_id, + model=request.model, + output_text="unexpected status 403 Forbidden: account balance is negative, please recharge first", + exit_code=1, + history_root=history_root, + run_root=run_root, + stderr_text="", + ) + + runner = AccountBlockerRunner() + app.runners["codex"] = runner + + payload = app.handlers.chat(quest_id, {"text": "Please continue.", "source": "tui-ink"}) + assert payload["ok"] is True + + deadline = time.time() + 5 + while time.time() < deadline: + snapshot = app.quest_service.snapshot(quest_id) + events = read_jsonl(Path(quest["quest_root"]) / ".ds" / "events.jsonl") + if any(item.get("type") == "runner.turn_error" for item in events): + if snapshot.get("retry_state") is None and str(snapshot.get("display_status") or "").strip() == "error": + break + time.sleep(0.05) + else: + raise AssertionError("provider account blocker did not settle into an immediate error state") + + snapshot = app.quest_service.snapshot(quest_id) + events = read_jsonl(Path(quest["quest_root"]) / ".ds" / "events.jsonl") + turn_errors = [item for item in events if item.get("type") == "runner.turn_error"] + + assert len(runner.requests) == 1 + assert snapshot["retry_state"] is None + assert snapshot["continuation_policy"] == "wait_for_user_or_resume" + assert snapshot["continuation_reason"] == "non_retryable_runner_error" + assert not any(item.get("type") == "runner.turn_retry_scheduled" for item in events) + assert turn_errors[-1]["diagnosis_code"] == "codex_provider_account_error" + + def test_daemon_skips_retry_for_non_retryable_minimax_protocol_error(temp_home: Path) -> None: ensure_home_layout(temp_home) ConfigManager(temp_home).ensure_files() diff --git a/tests/test_doctor.py b/tests/test_doctor.py index 0f577c37..dd31c536 100644 --- a/tests/test_doctor.py +++ b/tests/test_doctor.py @@ -13,6 +13,55 @@ from deepscientist.shared import append_jsonl, ensure_dir, utc_now, write_json +def _stub_ready_doctor_environment(monkeypatch) -> None: # type: ignore[no-untyped-def] + monkeypatch.setattr("deepscientist.doctor.resolve_runner_binary", lambda binary, runner_name=None: "/usr/bin/codex") + monkeypatch.setattr("deepscientist.doctor._query_local_health", lambda url: None) + monkeypatch.setattr("deepscientist.doctor._port_is_bindable", lambda host, port: (True, None)) + monkeypatch.setattr( + "deepscientist.doctor._check_bundles", + lambda root: { + "id": "bundles", + "label": "UI bundles", + "ok": True, + "status": "ok", + "summary": "Web and TUI bundles are present.", + "warnings": [], + "errors": [], + "guidance": [], + "details": {}, + }, + ) + monkeypatch.setattr("deepscientist.doctor.which", lambda name: "/usr/bin/uv" if name == "uv" else None) + monkeypatch.setattr( + "deepscientist.doctor.subprocess.run", + lambda *args, **kwargs: SimpleNamespace(returncode=0, stdout="uv 0.9.2\n", stderr=""), + ) + monkeypatch.setattr( + ConfigManager, + "git_readiness", + lambda self: { + "ok": True, + "installed": True, + "user_name": "Deep Scientist", + "user_email": "deep@example.com", + "warnings": [], + "errors": [], + "guidance": [], + }, + ) + monkeypatch.setattr( + ConfigManager, + "probe_codex_bootstrap", + lambda self, *, persist=False, payload=None: { + "ok": True, + "summary": "Codex startup probe completed.", + "warnings": [], + "errors": [], + "guidance": [], + }, + ) + + def test_cli_parser_exposes_doctor_and_removes_metrics() -> None: parser = build_parser() @@ -270,6 +319,54 @@ def test_doctor_reports_recent_runtime_failure_with_problem_why_fix(monkeypatch, assert f"evidence: quest: {quest['quest_id']}" in rendered +def test_doctor_reports_provider_account_runtime_failure_with_problem_why_fix(monkeypatch, temp_home: Path) -> None: + ensure_home_layout(temp_home) + manager = ConfigManager(temp_home) + manager.ensure_files() + quest = QuestService(temp_home).create("doctor provider account diagnosis quest") + quest_root = Path(quest["quest_root"]) + run_id = "run-provider-account-001" + run_root = ensure_dir(quest_root / ".ds" / "runs" / run_id) + write_json( + run_root / "result.json", + { + "ok": False, + "run_id": run_id, + "model": "provider-default", + "exit_code": 1, + "output_text": "unexpected status 402 Payment Required: insufficient quota", + "stderr_text": "", + "completed_at": utc_now(), + }, + ) + append_jsonl( + quest_root / ".ds" / "events.jsonl", + { + "event_id": "evt-provider-account-001", + "type": "runner.turn_error", + "quest_id": quest["quest_id"], + "run_id": run_id, + "source": "codex", + "skill_id": "baseline", + "model": "provider-default", + "summary": "Runner failed after provider rejected the account quota.", + "created_at": utc_now(), + }, + ) + _stub_ready_doctor_environment(monkeypatch) + + report = run_doctor(temp_home, repo_root=repo_root()) + rendered = render_doctor_report(report) + runtime_check = next(item for item in report["checks"] if item["id"] == "recent_runtime_failures") + + assert runtime_check["status"] == "warn" + assert runtime_check["problem"] == "The configured Codex provider account cannot serve the request." + assert "account, billing, quota" in str(runtime_check["why"]) + assert any("quota" in line.lower() for line in runtime_check["fix"]) + assert "problem: The configured Codex provider account cannot serve the request." in rendered + assert "fix: Check the configured provider account" in rendered + + def test_doctor_surfaces_probe_diagnosis_for_known_tool_argument_error(monkeypatch, temp_home: Path) -> None: ensure_home_layout(temp_home) manager = ConfigManager(temp_home) diff --git a/tests/test_runner_failure_diagnostics.py b/tests/test_runner_failure_diagnostics.py new file mode 100644 index 00000000..5712dd86 --- /dev/null +++ b/tests/test_runner_failure_diagnostics.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import pytest + +from deepscientist.diagnostics import diagnose_runner_failure + + +@pytest.mark.parametrize( + "message", + [ + "unexpected status 403 Forbidden: account balance is negative, please recharge first", + "unexpected status 402 Payment Required: insufficient quota", + '{"error":{"message":"billing hard limit has been reached","http_code":"403"}}', + "401 Unauthorized: invalid api key for the configured provider account", + ], +) +def test_codex_provider_account_errors_are_non_retryable_blockers(message: str) -> None: + diagnosis = diagnose_runner_failure(runner_name="codex", output_text=message) + + assert diagnosis is not None + assert diagnosis.code == "codex_provider_account_error" + assert diagnosis.retriable is False + assert "account" in diagnosis.problem.lower() + + +@pytest.mark.parametrize( + "message", + [ + "unexpected status 429 Too Many Requests: rate limit exceeded", + "unexpected status 503 Service Unavailable", + '{"error":{"message":"502 Bad Gateway from upstream provider","http_code":"502"}}', + "504 gateway timeout from upstream model provider", + ], +) +def test_codex_upstream_provider_errors_are_retryable_external_blockers(message: str) -> None: + diagnosis = diagnose_runner_failure(runner_name="codex", output_text=message) + + assert diagnosis is not None + assert diagnosis.code == "codex_upstream_provider_error" + assert diagnosis.retriable is True + + +def test_codex_bad_request_protocol_errors_stay_non_retryable_local_diagnostics() -> None: + diagnosis = diagnose_runner_failure( + runner_name="codex", + stderr_text='{"type":"error","error":{"type":"bad_request_error","message":"invalid params, tool call result does not follow tool call (2013)","http_code":"400"}}', + ) + + assert diagnosis is not None + assert diagnosis.code == "minimax_tool_result_sequence_error" + assert diagnosis.retriable is False