Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 72 additions & 1 deletion src/deepscientist/daemon/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -3402,6 +3402,19 @@ def _run_quest_turn(self, quest_id: str) -> None:
)
return
exhausted_summary = f"{failure_summary} Retry budget exhausted after {attempt_index} attempt(s)."
diagnosis = self._runner_failure_diagnosis(
runner_name=runner_name,
summary=exhausted_summary,
stderr_text=str(exc),
output_text="",
)
if diagnosis is not None and diagnosis.retriable:
self.quest_service.update_runtime_state(
quest_root=quest_root,
continuation_policy="wait_for_user_or_resume",
continuation_reason=self._retry_exhausted_continuation_reason(diagnosis),
continuation_updated_at=utc_now(),
)
self._append_retry_event(
quest_id,
event_type="runner.turn_retry_exhausted",
Expand All @@ -3414,6 +3427,7 @@ def _run_quest_turn(self, quest_id: str) -> None:
max_attempts=max_attempts,
summary=exhausted_summary,
failure_summary=failure_summary,
diagnosis=diagnosis,
)
self._record_turn_error(
quest_id=quest_id,
Expand All @@ -3423,6 +3437,8 @@ def _run_quest_turn(self, quest_id: str) -> None:
model=model,
summary=exhausted_summary,
retry_state=None,
diagnosis_code=diagnosis.code if diagnosis is not None else None,
guidance=list(diagnosis.guidance) if diagnosis is not None else None,
)
return

Expand Down Expand Up @@ -3576,6 +3592,19 @@ def _run_quest_turn(self, quest_id: str) -> None:
return

exhausted_summary = f"{failure_summary} Retry budget exhausted after {attempt_index} attempt(s)."
diagnosis = self._runner_failure_diagnosis(
runner_name=runner_name,
summary=exhausted_summary,
stderr_text=result.stderr_text,
output_text=result.output_text,
)
if diagnosis is not None and diagnosis.retriable:
self.quest_service.update_runtime_state(
quest_root=quest_root,
continuation_policy="wait_for_user_or_resume",
continuation_reason=self._retry_exhausted_continuation_reason(diagnosis),
continuation_updated_at=utc_now(),
)
self._append_retry_event(
quest_id,
event_type="runner.turn_retry_exhausted",
Expand All @@ -3588,6 +3617,7 @@ def _run_quest_turn(self, quest_id: str) -> None:
max_attempts=max_attempts,
summary=exhausted_summary,
failure_summary=failure_summary,
diagnosis=diagnosis,
)
self._record_turn_error(
quest_id=quest_id,
Expand All @@ -3597,6 +3627,8 @@ def _run_quest_turn(self, quest_id: str) -> None:
model=model,
summary=exhausted_summary,
retry_state=None,
diagnosis_code=diagnosis.code if diagnosis is not None else None,
guidance=list(diagnosis.guidance) if diagnosis is not None else None,
)
return
finally:
Expand Down Expand Up @@ -4035,6 +4067,7 @@ def _append_retry_event(
backoff_seconds: float | None = None,
next_attempt_index: int | None = None,
previous_run_id: str | None = None,
diagnosis: FailureDiagnosis | None = None,
) -> dict[str, Any]:
payload = {
"event_id": generate_id("evt"),
Expand All @@ -4058,6 +4091,9 @@ def _append_retry_event(
payload["next_attempt_index"] = next_attempt_index
if previous_run_id:
payload["previous_run_id"] = previous_run_id
if diagnosis is not None:
payload["diagnosis_code"] = diagnosis.code
payload["diagnosis"] = self._failure_diagnosis_payload(diagnosis)
append_jsonl(self.home / "quests" / quest_id / ".ds" / "events.jsonl", payload)
self.logger.log(
"warning" if "scheduled" in event_type or "exhausted" in event_type else "info",
Expand All @@ -4071,6 +4107,7 @@ def _append_retry_event(
backoff_seconds=backoff_seconds,
next_attempt_index=next_attempt_index,
previous_run_id=previous_run_id,
diagnosis_code=diagnosis.code if diagnosis is not None else None,
)
return payload

Expand Down Expand Up @@ -4262,6 +4299,34 @@ def _record_turn_error(
],
)

@staticmethod
def _failure_diagnosis_payload(diagnosis: FailureDiagnosis) -> dict[str, Any]:
    """Flatten *diagnosis* into a plain dict suitable for an event payload.

    Guidance entries are converted with ``str`` and any entry that is empty
    or whitespace-only is dropped. The surviving list is published under
    both the ``"fix"`` and ``"guidance"`` keys (the same list object), and
    ``retriable`` is coerced to a real ``bool``.
    """
    steps: list[str] = []
    for entry in diagnosis.guidance:
        text = str(entry)
        if text.strip():
            steps.append(text)

    payload: dict[str, Any] = {}
    payload["code"] = diagnosis.code
    payload["problem"] = diagnosis.problem
    payload["why"] = diagnosis.why
    # Both keys intentionally point at the same filtered guidance list.
    payload["fix"] = steps
    payload["guidance"] = steps
    payload["retriable"] = bool(diagnosis.retriable)
    payload["matched_text"] = diagnosis.matched_text
    return payload

@staticmethod
def _runner_failure_diagnosis(
    *,
    runner_name: str,
    summary: str,
    stderr_text: str,
    output_text: str,
) -> FailureDiagnosis | None:
    """Classify a runner failure by delegating to ``diagnose_runner_failure``.

    All four keyword arguments are forwarded unchanged and the helper's
    result (a diagnosis or ``None``) is returned as-is, without any
    filtering on retriability.
    """
    forwarded = {
        "runner_name": runner_name,
        "summary": summary,
        "stderr_text": stderr_text,
        "output_text": output_text,
    }
    diagnosis = diagnose_runner_failure(**forwarded)
    return diagnosis

@staticmethod
def _non_retryable_failure_diagnosis(
*,
Expand All @@ -4270,7 +4335,7 @@ def _non_retryable_failure_diagnosis(
stderr_text: str,
output_text: str,
) -> FailureDiagnosis | None:
diagnosis = diagnose_runner_failure(
diagnosis = DaemonApp._runner_failure_diagnosis(
runner_name=runner_name,
summary=summary,
stderr_text=stderr_text,
Expand All @@ -4280,6 +4345,12 @@ def _non_retryable_failure_diagnosis(
return None
return diagnosis

@staticmethod
def _retry_exhausted_continuation_reason(diagnosis: FailureDiagnosis) -> str:
    """Pick the continuation-reason string stored once retries are exhausted.

    Returns ``"external_codex_upstream_provider_error"`` when the diagnosis
    code is ``"codex_upstream_provider_error"``; every other code maps to
    the generic ``"runner_retry_budget_exhausted"``.
    """
    is_upstream_failure = diagnosis.code == "codex_upstream_provider_error"
    return (
        "external_codex_upstream_provider_error"
        if is_upstream_failure
        else "runner_retry_budget_exhausted"
    )

def _record_turn_postprocess_warning(
self,
*,
Expand Down
86 changes: 86 additions & 0 deletions src/deepscientist/diagnostics/runner_failures.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,46 @@ class FailureDiagnosis:
"unrecognized model",
)

# Marker tables used by diagnose_runner_failure for the "codex" runner.
# Every entry is lowercase because it is substring-matched against the
# lowercased failure text.

# Account / billing / credential phrases. Any single hit is enough to classify
# the failure as "codex_provider_account_error" (reported as non-retriable).
_CODEX_PROVIDER_ACCOUNT_ERROR_MARKERS = (
    "account balance is negative",
    "please recharge",
    "insufficient quota",
    "quota exceeded",
    "billing hard limit",
    "billing limit",
    "payment required",
    "invalid api key",
    "invalid_api_key",
    "incorrect api key",
    "api key is invalid",
    "unauthorized",
)

# HTTP status lines for account problems. These only count when the text also
# contains "unexpected status", "http_code", or "status".
_CODEX_PROVIDER_ACCOUNT_STATUS_MARKERS = (
    "401 unauthorized",
    "402 payment required",
    "403 forbidden",
)

# Upstream availability / throttling phrases. Any single hit classifies the
# failure as "codex_upstream_provider_error" (reported as retriable). The
# account markers above are checked first, so they take precedence.
_CODEX_UPSTREAM_ERROR_MARKERS = (
    "rate limit",
    "too many requests",
    "service unavailable",
    "bad gateway",
    "gateway timeout",
    "internal server error",
    "temporarily unavailable",
    "server overloaded",
)

# HTTP status lines for upstream problems, gated by the same
# "unexpected status" / "http_code" / "status" check as the account statuses.
_CODEX_UPSTREAM_STATUS_MARKERS = (
    "429 too many requests",
    "500 internal server error",
    "502 bad gateway",
    "503 service unavailable",
    "504 gateway timeout",
)


def _build_haystack(*values: object) -> str:
    """Join the non-blank string forms of *values* with newlines.

    Each value is converted via ``str(value or "")``, so falsy inputs such
    as ``None``, ``0`` or ``""`` become the empty string. Entries that are
    empty or whitespace-only are skipped; kept entries are not stripped.
    """
    pieces: list[str] = []
    for value in values:
        text = str(value or "")
        if not text.strip():
            continue
        pieces.append(text)
    return "\n".join(pieces)
Expand All @@ -46,6 +86,52 @@ def diagnose_runner_failure(
lower = haystack.lower()
normalized_runner = str(runner_name or "").strip().lower()

if normalized_runner == "codex" and (
any(marker in lower for marker in _CODEX_PROVIDER_ACCOUNT_ERROR_MARKERS)
or (
("unexpected status" in lower or "http_code" in lower or "status" in lower)
and any(marker in lower for marker in _CODEX_PROVIDER_ACCOUNT_STATUS_MARKERS)
)
):
return FailureDiagnosis(
code="codex_provider_account_error",
problem="The configured Codex provider account cannot serve the request.",
why=(
"The provider reported an account, billing, quota, or credential blocker. "
"Repeating the same quest turn will keep failing until the provider account state is corrected."
),
guidance=(
"Check the configured provider account, quota, billing status, credentials, and API-key scope.",
"Verify the same Codex profile works outside DeepScientist before resuming the quest.",
"Do not relaunch the same quest repeatedly until the provider account state is healthy.",
),
retriable=False,
matched_text="codex provider account error",
)

if normalized_runner == "codex" and (
any(marker in lower for marker in _CODEX_UPSTREAM_ERROR_MARKERS)
or (
("unexpected status" in lower or "http_code" in lower or "status" in lower)
and any(marker in lower for marker in _CODEX_UPSTREAM_STATUS_MARKERS)
)
):
return FailureDiagnosis(
code="codex_upstream_provider_error",
problem="The configured Codex upstream provider rejected or could not serve the request.",
why=(
"This is an external provider/API service condition. DeepScientist can retry with backoff, "
"but it cannot repair upstream provider availability from inside the quest runtime."
),
guidance=(
"Check the configured provider service health, rate limits, and API status.",
"Verify the same Codex profile works outside DeepScientist before resuming the quest.",
"Do not repeatedly relaunch the same quest if the provider continues returning the same upstream error.",
),
retriable=True,
matched_text="codex upstream provider error",
)

if (
"tool call result does not follow tool call (2013)" in lower
or "tool result's tool id" in lower
Expand Down
Loading