From 0672a981522f606d7f8ba52cf7a9fb5479b7dbd9 Mon Sep 17 00:00:00 2001 From: Ryan Breen Date: Thu, 16 Apr 2026 05:11:02 -0400 Subject: [PATCH 1/2] diagnostic(arm64): isr_unblock_for_io internal ring sites Co-authored-by: Ryan Breen Co-authored-by: Claude Code --- kernel/src/drivers/ahci/mod.rs | 14 ++++++ kernel/src/task/scheduler.rs | 89 ++++++++++++++++++++++++++++++++++ 2 files changed, 103 insertions(+) diff --git a/kernel/src/drivers/ahci/mod.rs b/kernel/src/drivers/ahci/mod.rs index 80b88055..4eef318e 100644 --- a/kernel/src/drivers/ahci/mod.rs +++ b/kernel/src/drivers/ahci/mod.rs @@ -263,6 +263,13 @@ pub(crate) const AHCI_TRACE_BEFORE_COMPLETE: u32 = 4; pub(crate) const AHCI_TRACE_AFTER_COMPLETE: u32 = 5; pub(crate) const AHCI_TRACE_WAKE_ENTER: u32 = 6; pub(crate) const AHCI_TRACE_WAKE_EXIT: u32 = 7; +pub(crate) const AHCI_TRACE_UNBLOCK_ENTRY: u32 = 8; +pub(crate) const AHCI_TRACE_UNBLOCK_AFTER_CPU: u32 = 9; +pub(crate) const AHCI_TRACE_UNBLOCK_AFTER_BUFFER: u32 = 10; +pub(crate) const AHCI_TRACE_UNBLOCK_AFTER_NEED_RESCHED: u32 = 11; +pub(crate) const AHCI_TRACE_UNBLOCK_BEFORE_SGI_SCAN: u32 = 12; +pub(crate) const AHCI_TRACE_UNBLOCK_PER_SGI: u32 = 13; +pub(crate) const AHCI_TRACE_UNBLOCK_EXIT: u32 = 14; struct AhciTraceSlot { site: AtomicU32, @@ -405,6 +412,13 @@ fn ahci_trace_site_name(site: u32) -> &'static str { AHCI_TRACE_AFTER_COMPLETE => "AFTER_COMPLETE", AHCI_TRACE_WAKE_ENTER => "WAKE_ENTER", AHCI_TRACE_WAKE_EXIT => "WAKE_EXIT", + AHCI_TRACE_UNBLOCK_ENTRY => "UNBLOCK_ENTRY", + AHCI_TRACE_UNBLOCK_AFTER_CPU => "UNBLOCK_AFTER_CPU", + AHCI_TRACE_UNBLOCK_AFTER_BUFFER => "UNBLOCK_AFTER_BUFFER", + AHCI_TRACE_UNBLOCK_AFTER_NEED_RESCHED => "UNBLOCK_AFTER_NEED_RESCHED", + AHCI_TRACE_UNBLOCK_BEFORE_SGI_SCAN => "UNBLOCK_BEFORE_SGI_SCAN", + AHCI_TRACE_UNBLOCK_PER_SGI => "UNBLOCK_PER_SGI", + AHCI_TRACE_UNBLOCK_EXIT => "UNBLOCK_EXIT", _ => "UNKNOWN", } } diff --git a/kernel/src/task/scheduler.rs b/kernel/src/task/scheduler.rs index 080431cb..b7729bdc 100644 --- a/kernel/src/task/scheduler.rs +++ b/kernel/src/task/scheduler.rs @@ -2527,17 +2527,93 @@ pub fn is_need_resched() -> bool { /// The scheduler drains the buffer under its own lock at the top of every /// `schedule_deferred_requeue()` / `schedule()` call. pub fn isr_unblock_for_io(tid: u64) { + #[cfg(target_arch = "aarch64")] + crate::drivers::ahci::push_ahci_event( + crate::drivers::ahci::AHCI_TRACE_UNBLOCK_ENTRY, + 0, + 0, + 0, + 0, + 0, + tid, + 0, + 0, + false, + ); let cpu = current_cpu_id_raw(); + #[cfg(target_arch = "aarch64")] + crate::drivers::ahci::push_ahci_event( + crate::drivers::ahci::AHCI_TRACE_UNBLOCK_AFTER_CPU, + 0, + 0, + 0, + 0, + 0, + tid, + 0, + 0, + false, + ); if cpu < ISR_WAKEUP_BUFFERS.len() { ISR_WAKEUP_BUFFERS[cpu].push(tid); } + #[cfg(target_arch = "aarch64")] + crate::drivers::ahci::push_ahci_event( + crate::drivers::ahci::AHCI_TRACE_UNBLOCK_AFTER_BUFFER, + 0, + 0, + 0, + 0, + 0, + tid, + 0, + 0, + false, + ); set_need_resched(); + #[cfg(target_arch = "aarch64")] + crate::drivers::ahci::push_ahci_event( + crate::drivers::ahci::AHCI_TRACE_UNBLOCK_AFTER_NEED_RESCHED, + 0, + 0, + 0, + 0, + 0, + tid, + 0, + 0, + false, + ); // Send IPI to idle CPUs so the buffer is drained promptly. #[cfg(target_arch = "aarch64")] { let online = crate::arch_impl::aarch64::smp::cpus_online() as usize; + crate::drivers::ahci::push_ahci_event( + crate::drivers::ahci::AHCI_TRACE_UNBLOCK_BEFORE_SGI_SCAN, + 0, + 0, + 0, + 0, + 0, + tid, + 0, + 0, + false, + ); for target in 0..online.min(MAX_CPUS) { if target != cpu && is_cpu_idle(target) { + crate::drivers::ahci::push_ahci_event( + crate::drivers::ahci::AHCI_TRACE_UNBLOCK_PER_SGI, + 0, + 0, + 0, + 0, + 0, + tid, + target as u32, + 0, + false, + ); crate::arch_impl::aarch64::gic::send_sgi( crate::arch_impl::aarch64::constants::SGI_RESCHEDULE as u8, target as u8, @@ -2545,6 +2621,19 @@ pub fn isr_unblock_for_io(tid: u64) { } } } + #[cfg(target_arch = "aarch64")] + crate::drivers::ahci::push_ahci_event( + crate::drivers::ahci::AHCI_TRACE_UNBLOCK_EXIT, + 0, + 0, + 0, + 0, + 0, + tid, + 0, + 0, + false, + ); } /// Read the current CPU ID directly from hardware (MPIDR_EL1 on ARM64). From f34e66d5e7f4bbfeac724f19ab61efd420d6d9e2 Mon Sep 17 00:00:00 2001 From: Ryan Breen Date: Thu, 16 Apr 2026 05:26:22 -0400 Subject: [PATCH 2/2] investigation(arm64): F10 isr_unblock_for_io breadcrumb sweep Co-authored-by: Ryan Breen Co-authored-by: Claude Code --- .../decisions.md | 9 + .../exit.md | 226 ++++++++++++++++++ .../scratchpad.md | 53 ++++ docs/planning/ARM64_CPU0_SMP_INVESTIGATION.md | 195 +++++++++++++++ 4 files changed, 483 insertions(+) create mode 100644 .factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/decisions.md create mode 100644 .factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/exit.md create mode 100644 .factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/scratchpad.md diff --git a/.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/decisions.md b/.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/decisions.md new file mode 100644 index 00000000..1ca91ad2 --- /dev/null +++ b/.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/decisions.md @@ -0,0 +1,9 @@ +# F10 ISR Unblock Boundary Decisions + +## 2026-04-16 05:08 - Encode per-SGI target CPU in `slot_mask` + +The existing AHCI ring schema already has `slot_mask`, and the new +`UNBLOCK_PER_SGI` event has no AHCI slot mask semantics. Reusing `slot_mask` for +the target CPU keeps the ring struct unchanged and avoids broadening the +diagnostic patch. For all other new `UNBLOCK_*` sites, `slot_mask` remains `0`. + diff --git a/.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/exit.md b/.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/exit.md new file mode 100644 index 00000000..47c5a0ef --- /dev/null +++ b/.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/exit.md @@ -0,0 +1,226 @@ +# F10 ISR Unblock Boundary Diagnostic Exit + +## What I built + +- `kernel/src/drivers/ahci/mod.rs`: added AHCI ring site tags + `UNBLOCK_ENTRY`, `UNBLOCK_AFTER_CPU`, `UNBLOCK_AFTER_BUFFER`, + `UNBLOCK_AFTER_NEED_RESCHED`, `UNBLOCK_BEFORE_SGI_SCAN`, + `UNBLOCK_PER_SGI`, and `UNBLOCK_EXIT`; extended site-name rendering. +- `kernel/src/task/scheduler.rs`: added seven ring-only breadcrumbs inside + `isr_unblock_for_io(tid)`. +- `docs/planning/ARM64_CPU0_SMP_INVESTIGATION.md`: appended the F10 sweep, + verbatim stalled-token extract, last-site verdict, candidate ranking, and + F11 recommendation. +- `.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/scratchpad.md`: + maintained the run notebook. +- `.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/decisions.md`: + documented the choice to encode per-SGI target CPU in `slot_mask`. + +## What the original ask was + +Branch from `diagnostic/f9-completion-boundary`, add internal AHCI-ring +breadcrumbs to `scheduler::isr_unblock_for_io(tid)` without changing scheduler +semantics, run a five-run Parallels sweep, and use the new `UNBLOCK_*` sequence +to identify where the stuck wake call stopped. + +## How what I built meets that ask + +- **Implemented**: branch `diagnostic/f10-isr-unblock-boundary` was created + from `diagnostic/f9-completion-boundary`. +- **Implemented**: new AHCI site constants and display names were added in + `kernel/src/drivers/ahci/mod.rs:266` and `kernel/src/drivers/ahci/mod.rs:415`. +- **Implemented**: `isr_unblock_for_io(tid)` breadcrumbs were added in + `kernel/src/task/scheduler.rs:2529`. +- **Implemented**: `UNBLOCK_PER_SGI` encodes target CPU in `slot_mask`, recorded + in `decisions.md`. +- **Implemented**: aarch64 build completed with zero warnings/errors. +- **Implemented**: 5x `./run.sh --parallels --test 60` artifacts were captured + under `logs/breenix-parallels-cpu0/f10-isr-unblock/run{1..5}/`. +- **Implemented**: each `summary.txt` contains all F9 fields plus the seven + requested `UNBLOCK_*` counts. +- **Implemented**: investigation doc appended at + `docs/planning/ARM64_CPU0_SMP_INVESTIGATION.md:2364`. + +## What I did NOT build + +- I did not change scheduler, SGI, or wake-buffer semantics. +- I did not change F7 admission, F8 ring design, or F9 completion-boundary code. +- I did not touch Tier-1 or Tier-2 prohibited files. +- I did not add logging macros, locks, heap allocation, or new trace-ring + fields. + +## Known risks and gaps + +- All five `run.sh` invocations returned `exit_status=1`, consistent with F9's + headless screenshot/test helper behavior; serial logs were captured and used + for diagnosis. +- The F10 sweep exposed two internal signatures: runs 1 and 3 stop after + `UNBLOCK_PER_SGI`, while run 2 stops after `UNBLOCK_AFTER_CPU`. The primary + F11 direction is SGI delivery; wake-buffer push remains a secondary candidate. +- Repo-wide `cargo fmt --check` fails on pre-existing unrelated formatting and + trailing-whitespace issues, so no broad formatter was applied. +- Beads tracking could not be created because `bd create` failed against the + local Dolt runtime server (`database "breenix" not found on Dolt server at + 127.0.0.1:63842`). + +## Sweep summaries + +```text +run1: +exit_status=1 +ahci_timeouts=14 +ahci_ring_entries=64 +ahci_port0_is=2 +ahci_port1_is=2 +bsshd_started=1 +before_complete=10 +after_complete=8 +wake_enter=2 +wake_exit=2 +unblock_entry=2 +unblock_after_cpu=2 +unblock_after_buffer=2 +unblock_after_need_resched=2 +unblock_before_sgi_scan=2 +unblock_per_sgi=6 +unblock_exit=2 + +run2: +exit_status=1 +ahci_timeouts=74 +ahci_ring_entries=64 +ahci_port0_is=2 +ahci_port1_is=2 +bsshd_started=1 +before_complete=10 +after_complete=8 +wake_enter=2 +wake_exit=2 +unblock_entry=2 +unblock_after_cpu=2 +unblock_after_buffer=0 +unblock_after_need_resched=0 +unblock_before_sgi_scan=0 +unblock_per_sgi=12 +unblock_exit=2 + +run3: +exit_status=1 +ahci_timeouts=1227 +ahci_ring_entries=31 +ahci_port0_is=1 +ahci_port1_is=1 +bsshd_started=1 +before_complete=4 +after_complete=4 +wake_enter=1 +wake_exit=1 +unblock_entry=1 +unblock_after_cpu=1 +unblock_after_buffer=1 +unblock_after_need_resched=1 +unblock_before_sgi_scan=1 +unblock_per_sgi=3 +unblock_exit=1 + +run4: +exit_status=1 +ahci_timeouts=0 +ahci_ring_entries=0 +ahci_port0_is=0 +ahci_port1_is=0 +bsshd_started=1 +before_complete=0 +after_complete=0 +wake_enter=0 +wake_exit=0 +unblock_entry=0 +unblock_after_cpu=0 +unblock_after_buffer=0 +unblock_after_need_resched=0 +unblock_before_sgi_scan=0 +unblock_per_sgi=0 +unblock_exit=0 + +run5: +exit_status=1 +ahci_timeouts=670 +ahci_ring_entries=64 +ahci_port0_is=2 +ahci_port1_is=2 +bsshd_started=1 +before_complete=6 +after_complete=8 +wake_enter=2 +wake_exit=2 +unblock_entry=2 +unblock_after_cpu=2 +unblock_after_buffer=2 +unblock_after_need_resched=2 +unblock_before_sgi_scan=2 +unblock_per_sgi=12 +unblock_exit=2 +``` + +## Last-site verdict + +Primary sample: `run1`, token `1212`, waiter `tid=11`. The last emitted site is +`UNBLOCK_PER_SGI`, with `slot_mask=0x2` encoding target CPU 2. That event is +emitted immediately before `gic::send_sgi()`, and there is no later +`UNBLOCK_EXIT`, `WAKE_EXIT`, `AFTER_COMPLETE`, or `RETURN` for token `1212`. + +Corroborating sample: `run3`, token `1396`, waiter `tid=13`, last emitted site +`UNBLOCK_PER_SGI`, with `slot_mask=0x1` encoding target CPU 1. + +Alternate sample: `run2`, token `1263`, waiter `tid=13`, last emitted site +`UNBLOCK_AFTER_CPU`, with no `UNBLOCK_AFTER_BUFFER` for that stuck call. + +## Verbatim extract + +```text +[ahci] Port 1 TIMEOUT (5s): CI=0x0 IS=0x1 TFD=0x40 HBA_IS=0x2 +[ahci] GIC: SPI34 pend=true act=true DAIF=0x300 pend_snap=[0x800004,0x0,0x0] +[ahci] isr_count=1210 cmd#=1214 completion_done=0 PMR=0xf8 RPR=0xff +[ahci] port_isr_hits=[1,1209] complete_hits=[0,3] +[ahci] isr_last_pmr=0xf8 last_port1_IS=0x1 last_port1_cmd_num=1212 +[ahci] read_block(522) wait failed: AHCI: command timeout +[STUCK_SPI34] cpu=3 gic_version=3 +[STUCK_SPI34] GICD_ISPENDR[1]=0x800004 bit=2 pending=true +[STUCK_SPI34] GICD_ISACTIVER[1]=0x4 bit=2 active=true +[STUCK_SPI34] ICC_RPR_EL1=0xff +[STUCK_SPI34] ICC_PMR_EL1=0xf8 +[STUCK_SPI34] DAIF=0x300 +[STUCK_SPI34] AHCI_PORT0_IS=0x0 +[STUCK_SPI34] AHCI_PORT1_IS=0x1 +[AHCI_RING] nsec=2379603791 site=UNBLOCK_PER_SGI cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x2 token=0 wake_success=0 seq=3677 +[AHCI_RING] nsec=2379602916 site=UNBLOCK_BEFORE_SGI_SCAN cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3676 +[AHCI_RING] nsec=2379602041 site=UNBLOCK_AFTER_NEED_RESCHED cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3675 +[AHCI_RING] nsec=2379601166 site=UNBLOCK_AFTER_BUFFER cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3674 +[AHCI_RING] nsec=2379600291 site=UNBLOCK_AFTER_CPU cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3673 +[AHCI_RING] nsec=2379599416 site=UNBLOCK_ENTRY cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3672 +[AHCI_RING] nsec=2379598541 site=WAKE_ENTER cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=1212 wake_success=0 seq=3671 +[AHCI_RING] nsec=2379597666 site=BEFORE_COMPLETE cpu=0 port=1 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x1 token=1212 wake_success=0 seq=3670 +[AHCI_RING] nsec=2379596750 site=POST_CLEAR cpu=0 port=1 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x1 token=1212 wake_success=0 seq=3669 +[AHCI_RING] nsec=2379589250 site=ENTER cpu=0 port=1 IS=0x1 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=1212 wake_success=0 seq=3668 +``` + +## F11 recommendation + +F11 should audit SGI delivery first: add ring-only breadcrumbs inside +`gic::send_sgi()` and SGI target construction, before and after each +architectural/system-register write, preserving the target CPU in the event. + +Keep a small secondary guard around `IsrWakeupBuffer::push()` so the `run2` +`UNBLOCK_AFTER_CPU` signature can be confirmed or eliminated without changing +scheduler behavior. + +## How to verify + +```bash +git log --oneline --max-count=2 +cargo build --release --target aarch64-breenix.json -Z build-std=core,alloc -Z build-std-features=compiler-builtins-mem -p kernel --bin kernel-aarch64 2>&1 | tee /tmp/f10-aarch64-build.log +grep -E '^(warning|error)' /tmp/f10-aarch64-build.log +git diff --check +for i in 1 2 3 4 5; do cat logs/breenix-parallels-cpu0/f10-isr-unblock/run$i/summary.txt; done +rg -n "site=(UNBLOCK_|WAKE_|BEFORE_COMPLETE|AFTER_COMPLETE|ENTER|POST_CLEAR|RETURN)" logs/breenix-parallels-cpu0/f10-isr-unblock/run1/serial.log | tail -40 +``` diff --git a/.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/scratchpad.md b/.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/scratchpad.md new file mode 100644 index 00000000..47b5733f --- /dev/null +++ b/.factory-runs/arm64-f10-isr-unblock-boundary-20260416-050814/scratchpad.md @@ -0,0 +1,53 @@ +# F10 ISR Unblock Boundary Scratchpad + +## 2026-04-16 05:08 + +Starting factory run on branch `diagnostic/f10-isr-unblock-boundary`, branched +from clean `diagnostic/f9-completion-boundary`. + +Beads tracking attempted first, but `bd create` failed because the local Dolt +runtime server reports `database "breenix" not found on Dolt server at +127.0.0.1:63842`. Continuing with the factory run and recording that blocker +in exit documentation, matching F9's known gap. + +Starting milestone 1: add ring-only breadcrumb sites inside +`scheduler::isr_unblock_for_io(tid)` and extend AHCI site-tag display. Scope is +limited to `kernel/src/drivers/ahci/mod.rs` and `kernel/src/task/scheduler.rs`. + +Implemented milestone 1 code changes: +- `kernel/src/drivers/ahci/mod.rs`: added `AHCI_TRACE_UNBLOCK_*` constants and + display names. +- `kernel/src/task/scheduler.rs`: added seven `push_ahci_event()` breadcrumbs in + `isr_unblock_for_io(tid)`, with `UNBLOCK_PER_SGI` carrying target CPU in + `slot_mask`. + +Starting milestone 1 validation: format/diff hygiene and clean aarch64 build. + +Milestone 1 validation results: +- `git diff --check`: passed. +- `cargo build --release --target aarch64-breenix.json -Z build-std=core,alloc -Z build-std-features=compiler-builtins-mem -p kernel --bin kernel-aarch64`: passed. +- `grep -E '^(warning|error)' /tmp/f10-aarch64-build.log`: no output. +- `cargo fmt --check`: failed on pre-existing unrelated formatting/trailing + whitespace in files outside this F10 scope. No broad formatting applied. + +Committed milestone 1 as: +`diagnostic(arm64): isr_unblock_for_io internal ring sites`. + +Starting milestone 2: run the 5x Parallels sweep, summarize the new +`UNBLOCK_*` counts, append the investigation doc, and write exit documentation. + +Sweep results: +- `run1`: timeout sample. Stuck token `1212` / waiter `tid=11`, last site + `UNBLOCK_PER_SGI`, target CPU encoded as `slot_mask=0x2`. +- `run2`: timeout sample. Stuck token `1263` / waiter `tid=13`, last site + `UNBLOCK_AFTER_CPU`; no `UNBLOCK_AFTER_BUFFER` retained for that stuck call. +- `run3`: timeout sample. Stuck token `1396` / waiter `tid=13`, last site + `UNBLOCK_PER_SGI`, target CPU encoded as `slot_mask=0x1`. +- `run4`: reached `bsshd`, no AHCI timeout/ring output in the retained window. +- `run5`: timeout output present, but retained ring head showed token `1315` + completing through `UNBLOCK_EXIT` / `WAKE_EXIT` / `AFTER_COMPLETE`. + +Decision: rank SGI delivery as the primary F11 direction because two +discriminating timeout samples stop immediately after `UNBLOCK_PER_SGI`, with +the wake-buffer push path retained as a secondary candidate due to `run2`. + diff --git a/docs/planning/ARM64_CPU0_SMP_INVESTIGATION.md b/docs/planning/ARM64_CPU0_SMP_INVESTIGATION.md index 0f4209ed..c5d14e19 100644 --- a/docs/planning/ARM64_CPU0_SMP_INVESTIGATION.md +++ b/docs/planning/ARM64_CPU0_SMP_INVESTIGATION.md @@ -2360,3 +2360,198 @@ wake_exit=2 - likely F10 split: if the last site is before/inside `send_sgi()`, audit SGI delivery/GICR targeting; if it is before `set_need_resched()` or buffer push, audit the per-CPU wake buffer atomics + +### 2026-04-16: F10 `isr_unblock_for_io()` Internal Breadcrumb Diagnostic + +- Probe change summary: + - branched from `diagnostic/f9-completion-boundary` to + `diagnostic/f10-isr-unblock-boundary` + - extended the existing AHCI trace ring in + [ahci/mod.rs](/Users/wrb/fun/code/breenix-worktrees/f9-completion-boundary/kernel/src/drivers/ahci/mod.rs) + with `UNBLOCK_ENTRY`, `UNBLOCK_AFTER_CPU`, `UNBLOCK_AFTER_BUFFER`, + `UNBLOCK_AFTER_NEED_RESCHED`, `UNBLOCK_BEFORE_SGI_SCAN`, + `UNBLOCK_PER_SGI`, and `UNBLOCK_EXIT` + - instrumented + [scheduler.rs](/Users/wrb/fun/code/breenix-worktrees/f9-completion-boundary/kernel/src/task/scheduler.rs) + `isr_unblock_for_io(tid)` with ring-only breadcrumbs at the requested + internal boundaries + - encoded `UNBLOCK_PER_SGI` target CPU in the existing `slot_mask` field; + non-SGI `UNBLOCK_*` entries leave `slot_mask=0` +- Validation: + - clean aarch64 build: + `cargo build --release --target aarch64-breenix.json -Z build-std=core,alloc -Z build-std-features=compiler-builtins-mem -p kernel --bin kernel-aarch64` + - `grep -E '^(warning|error)' /tmp/f10-aarch64-build.log` produced no output + - `git diff --check` passed + - repo-wide `cargo fmt --check` still fails on pre-existing unrelated + formatting/trailing-whitespace issues; no broad formatter was applied +- Artifacts: + - [run1](/Users/wrb/fun/code/breenix-worktrees/f9-completion-boundary/logs/breenix-parallels-cpu0/f10-isr-unblock/run1) + - [run2](/Users/wrb/fun/code/breenix-worktrees/f9-completion-boundary/logs/breenix-parallels-cpu0/f10-isr-unblock/run2) + - [run3](/Users/wrb/fun/code/breenix-worktrees/f9-completion-boundary/logs/breenix-parallels-cpu0/f10-isr-unblock/run3) + - [run4](/Users/wrb/fun/code/breenix-worktrees/f9-completion-boundary/logs/breenix-parallels-cpu0/f10-isr-unblock/run4) + - [run5](/Users/wrb/fun/code/breenix-worktrees/f9-completion-boundary/logs/breenix-parallels-cpu0/f10-isr-unblock/run5) +- Sweep summaries: + +```text +run1: +exit_status=1 +ahci_timeouts=14 +ahci_ring_entries=64 +ahci_port0_is=2 +ahci_port1_is=2 +bsshd_started=1 +before_complete=10 +after_complete=8 +wake_enter=2 +wake_exit=2 +unblock_entry=2 +unblock_after_cpu=2 +unblock_after_buffer=2 +unblock_after_need_resched=2 +unblock_before_sgi_scan=2 +unblock_per_sgi=6 +unblock_exit=2 + +run2: +exit_status=1 +ahci_timeouts=74 +ahci_ring_entries=64 +ahci_port0_is=2 +ahci_port1_is=2 +bsshd_started=1 +before_complete=10 +after_complete=8 +wake_enter=2 +wake_exit=2 +unblock_entry=2 +unblock_after_cpu=2 +unblock_after_buffer=0 +unblock_after_need_resched=0 +unblock_before_sgi_scan=0 +unblock_per_sgi=12 +unblock_exit=2 + +run3: +exit_status=1 +ahci_timeouts=1227 +ahci_ring_entries=31 +ahci_port0_is=1 +ahci_port1_is=1 +bsshd_started=1 +before_complete=4 +after_complete=4 +wake_enter=1 +wake_exit=1 +unblock_entry=1 +unblock_after_cpu=1 +unblock_after_buffer=1 +unblock_after_need_resched=1 +unblock_before_sgi_scan=1 +unblock_per_sgi=3 +unblock_exit=1 + +run4: +exit_status=1 +ahci_timeouts=0 +ahci_ring_entries=0 +ahci_port0_is=0 +ahci_port1_is=0 +bsshd_started=1 +before_complete=0 +after_complete=0 +wake_enter=0 +wake_exit=0 +unblock_entry=0 +unblock_after_cpu=0 +unblock_after_buffer=0 +unblock_after_need_resched=0 +unblock_before_sgi_scan=0 +unblock_per_sgi=0 +unblock_exit=0 + +run5: +exit_status=1 +ahci_timeouts=670 +ahci_ring_entries=64 +ahci_port0_is=2 +ahci_port1_is=2 +bsshd_started=1 +before_complete=6 +after_complete=8 +wake_enter=2 +wake_exit=2 +unblock_entry=2 +unblock_after_cpu=2 +unblock_after_buffer=2 +unblock_after_need_resched=2 +unblock_before_sgi_scan=2 +unblock_per_sgi=12 +unblock_exit=2 +``` + +- Primary stalled-token extract from `run1`: + +```text +[ahci] Port 1 TIMEOUT (5s): CI=0x0 IS=0x1 TFD=0x40 HBA_IS=0x2 +[ahci] GIC: SPI34 pend=true act=true DAIF=0x300 pend_snap=[0x800004,0x0,0x0] +[ahci] isr_count=1210 cmd#=1214 completion_done=0 PMR=0xf8 RPR=0xff +[ahci] port_isr_hits=[1,1209] complete_hits=[0,3] +[ahci] isr_last_pmr=0xf8 last_port1_IS=0x1 last_port1_cmd_num=1212 +[ahci] read_block(522) wait failed: AHCI: command timeout +[STUCK_SPI34] cpu=3 gic_version=3 +[STUCK_SPI34] GICD_ISPENDR[1]=0x800004 bit=2 pending=true +[STUCK_SPI34] GICD_ISACTIVER[1]=0x4 bit=2 active=true +[STUCK_SPI34] ICC_RPR_EL1=0xff +[STUCK_SPI34] ICC_PMR_EL1=0xf8 +[STUCK_SPI34] DAIF=0x300 +[STUCK_SPI34] AHCI_PORT0_IS=0x0 +[STUCK_SPI34] AHCI_PORT1_IS=0x1 +[AHCI_RING] nsec=2379603791 site=UNBLOCK_PER_SGI cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x2 token=0 wake_success=0 seq=3677 +[AHCI_RING] nsec=2379602916 site=UNBLOCK_BEFORE_SGI_SCAN cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3676 +[AHCI_RING] nsec=2379602041 site=UNBLOCK_AFTER_NEED_RESCHED cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3675 +[AHCI_RING] nsec=2379601166 site=UNBLOCK_AFTER_BUFFER cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3674 +[AHCI_RING] nsec=2379600291 site=UNBLOCK_AFTER_CPU cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3673 +[AHCI_RING] nsec=2379599416 site=UNBLOCK_ENTRY cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3672 +[AHCI_RING] nsec=2379598541 site=WAKE_ENTER cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=1212 wake_success=0 seq=3671 +[AHCI_RING] nsec=2379597666 site=BEFORE_COMPLETE cpu=0 port=1 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x1 token=1212 wake_success=0 seq=3670 +[AHCI_RING] nsec=2379596750 site=POST_CLEAR cpu=0 port=1 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x1 token=1212 wake_success=0 seq=3669 +[AHCI_RING] nsec=2379589250 site=ENTER cpu=0 port=1 IS=0x1 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=1212 wake_success=0 seq=3668 +[AHCI_RING] nsec=2378591666 site=RETURN cpu=0 port=1 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x1 token=1211 wake_success=1 seq=3667 +[AHCI_RING] nsec=2378590416 site=AFTER_COMPLETE cpu=0 port=1 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x1 token=1211 wake_success=0 seq=3666 +[AHCI_RING] nsec=2378589375 site=WAKE_EXIT cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=1211 wake_success=0 seq=3665 +[AHCI_RING] nsec=2378588333 site=UNBLOCK_EXIT cpu=0 port=0 IS=0x0 CI=0x0 SACT=0x0 SERR=0x0 waiter_tid=11 slot_mask=0x0 token=0 wake_success=0 seq=3664 +``` + +- Last-site verdict: + - primary sample `run1`: stuck token `1212`, waiter `tid=11`, reached + `UNBLOCK_PER_SGI` with `slot_mask=0x2` (target CPU 2) and never emitted + another `UNBLOCK_PER_SGI`, `UNBLOCK_EXIT`, `WAKE_EXIT`, `AFTER_COMPLETE`, + or `RETURN` for that token + - corroborating sample `run3`: stuck token `1396`, waiter `tid=13`, reached + `UNBLOCK_PER_SGI` with `slot_mask=0x1` (target CPU 1) and did not emit + `UNBLOCK_EXIT` / `WAKE_EXIT` for that token + - alternate sample `run2`: stuck token `1263`, waiter `tid=13`, reached + `UNBLOCK_AFTER_CPU` and did not emit `UNBLOCK_AFTER_BUFFER`; this keeps the + wake-buffer push path on the candidate list, but it was not the majority + signature in this sweep +- Candidate root-cause ranking: + 1. SGI delivery path: strongest current lead. Two timeout captures stop after + `UNBLOCK_PER_SGI`, which is emitted immediately before + `gic::send_sgi(SGI_RESCHEDULE, target)`. The target CPU is visible in + `slot_mask` (`0x2` in `run1`, `0x1` in `run3`). + 2. Wake-buffer push path: one timeout capture stops after + `UNBLOCK_AFTER_CPU`, before `UNBLOCK_AFTER_BUFFER`, implying a possible + stall inside `ISR_WAKEUP_BUFFERS[cpu].push(tid)` or an adjacent memory + corruption/ring-retention artifact. + 3. `set_need_resched()` / SGI scan predicate: lower probability. The primary + samples pass `UNBLOCK_AFTER_NEED_RESCHED` and + `UNBLOCK_BEFORE_SGI_SCAN` before stalling. +- F11 recommendation: + - primary F11 probe direction: audit SGI delivery by instrumenting + `gic::send_sgi()` / SGI target construction with ring-only breadcrumbs + before and after each architectural write, preserving the target CPU in the + ring entry + - secondary F11 guardrail: keep one minimal breadcrumb around the + `IsrWakeupBuffer::push()` CAS loop or add a bounded push-site counter so the + `run2` `UNBLOCK_AFTER_CPU` signature can be confirmed or eliminated without + changing scheduler semantics