From 54e29c3bd15cae215c4d2987d58e512a5f80bc40 Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Tue, 14 Apr 2026 19:07:32 -0700 Subject: [PATCH 1/2] chore(debug): T98.3.2 remove FusedRMSNormGPU sub-step probes Probes were added in T98.2.2 to pin the corrupting sub-step inside FusedRMSNormGPU. Bug was found (devIn=0x0, pass-through aliasing issue) and fixed in T98.2.3. Probes no longer needed. Refs E98 T98.3.2. --- .claude/settings.local.json | 7 ++++++ compute/gpu_fused_rmsnorm.go | 5 ----- compute/gpu_fused_rmsnorm_debug.go | 35 ------------------------------ 3 files changed, 7 insertions(+), 40 deletions(-) create mode 100644 .claude/settings.local.json delete mode 100644 compute/gpu_fused_rmsnorm_debug.go diff --git a/.claude/settings.local.json b/.claude/settings.local.json new file mode 100644 index 0000000..13ef07f --- /dev/null +++ b/.claude/settings.local.json @@ -0,0 +1,7 @@ +{ + "permissions": { + "additionalDirectories": [ + "/Users/dndungu/Code/zerfoo/zerfoo" + ] + } +} diff --git a/compute/gpu_fused_rmsnorm.go b/compute/gpu_fused_rmsnorm.go index 7800fd9..c9bbe04 100644 --- a/compute/gpu_fused_rmsnorm.go +++ b/compute/gpu_fused_rmsnorm.go @@ -40,7 +40,6 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float return FusedRMSNorm(input, weight, epsilon) } defer cleanupIn() - fusedRMSNormProbe("entry:devIn", e.runtime, e.stream, devIn, total*f32Size) // Get weight device pointer. Weight may be Float16Storage (from FP16 // weight upload), GPUStorage[float32], or CPU-resident. @@ -76,14 +75,12 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float } } defer cleanupWeight() - fusedRMSNormProbe("after:weightToGPU", e.runtime, e.stream, devWeight, weight.Size()*f32Size) outByteSize := total * f32Size devOut, err := e.pool.Alloc(e.deviceID, outByteSize) if err != nil { return FusedRMSNorm(input, weight, epsilon) } - fusedRMSNormProbe("after:allocDevOut", e.runtime, e.stream, devOut, outByteSize) scalesByteSize := rows * f32Size devScales, err := e.pool.Alloc(e.deviceID, scalesByteSize) @@ -91,14 +88,12 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float e.pool.Free(e.deviceID, devOut, outByteSize) return FusedRMSNorm(input, weight, epsilon) } - fusedRMSNormProbe("after:allocDevScales", e.runtime, e.stream, devScales, scalesByteSize) if err := e.kernels.RMSNorm(devIn, devWeight, devOut, devScales, epsilon, rows, D, e.stream); err != nil { e.pool.Free(e.deviceID, devOut, outByteSize) e.pool.Free(e.deviceID, devScales, scalesByteSize) return nil, nil, err } - fusedRMSNormProbe("after:kernelRMSNorm", e.runtime, e.stream, devOut, outByteSize) outTensor, err := makeGPUResult[float32](f32Engine, shape, devOut, total) if err != nil { diff --git a/compute/gpu_fused_rmsnorm_debug.go b/compute/gpu_fused_rmsnorm_debug.go deleted file mode 100644 index 15ebadc..0000000 --- a/compute/gpu_fused_rmsnorm_debug.go +++ /dev/null @@ -1,35 +0,0 @@ -package compute - -import ( - "fmt" - "os" - "unsafe" - - "github.com/zerfoo/ztensor/internal/gpuapi" -) - -// fusedRMSNormDebug is gated on ZERFOO_GQA_DEBUG=1 (same env var used by the -// zerfoo-side GQA / PLE instrumentation) so we can bisect the corrupting -// sub-step inside FusedRMSNormGPU for E98.T98.2.2 without spamming prod logs. -var fusedRMSNormDebug = os.Getenv("ZERFOO_GQA_DEBUG") == "1" - -// fusedRMSNormProbe force-syncs the stream and issues a 1-byte D2H cudaMemcpy -// from ptr. If CUDA is in a sticky error state, either call surfaces the error -// immediately, which lets us pin the offending sub-step inside -// FusedRMSNormGPU. No-op when the debug env gate is off. -func fusedRMSNormProbe(tag string, runtime gpuapi.Runtime, stream gpuapi.Stream, ptr unsafe.Pointer, byteLen int) { - if !fusedRMSNormDebug { - return - } - var syncErr error - if stream != nil { - syncErr = stream.Synchronize() - } - var memcpyErr error - if ptr != nil && byteLen > 0 && runtime != nil { - var probe [1]byte - memcpyErr = runtime.Memcpy(unsafe.Pointer(&probe[0]), ptr, 1, gpuapi.MemcpyDeviceToHost) - } - fmt.Fprintf(os.Stderr, "[RMS_DBG] %s gpuPtr=%p bytes=%d sync=%v memcpy=%v\n", - tag, ptr, byteLen, syncErr, memcpyErr) -} From 12704687d5d62e8ae6bdc6a503af26a7dbae7324 Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Tue, 14 Apr 2026 19:07:43 -0700 Subject: [PATCH 2/2] chore: ignore .claude/ local settings --- .claude/settings.local.json | 7 ------- .gitignore | 1 + 2 files changed, 1 insertion(+), 7 deletions(-) delete mode 100644 .claude/settings.local.json diff --git a/.claude/settings.local.json b/.claude/settings.local.json deleted file mode 100644 index 13ef07f..0000000 --- a/.claude/settings.local.json +++ /dev/null @@ -1,7 +0,0 @@ -{ - "permissions": { - "additionalDirectories": [ - "/Users/dndungu/Code/zerfoo/zerfoo" - ] - } -} diff --git a/.gitignore b/.gitignore index f913dd0..9271130 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ go.work.sum # Object files *.o +.claude/