From 54e29c3bd15cae215c4d2987d58e512a5f80bc40 Mon Sep 17 00:00:00 2001
From: David Ndungu <ndungu.sink@gmail.com>
Date: Tue, 14 Apr 2026 19:07:32 -0700
Subject: [PATCH 1/2] chore(debug): T98.3.2 remove FusedRMSNormGPU sub-step
 probes

The probes were added in T98.2.2 to pin down the corrupting sub-step
inside FusedRMSNormGPU. The bug was found (devIn=0x0, a pass-through
aliasing issue) and fixed in T98.2.3, so the probes are no longer
needed.

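Note: this commit also adds .claude/settings.local.json, which is
unrelated to the probe removal; it is deleted again in PATCH 2/2.

Refs E98 T98.3.2.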
---
 .claude/settings.local.json        |  7 ++++++
 compute/gpu_fused_rmsnorm.go       |  5 -----
 compute/gpu_fused_rmsnorm_debug.go | 35 ------------------------------
 3 files changed, 7 insertions(+), 40 deletions(-)
 create mode 100644 .claude/settings.local.json
 delete mode 100644 compute/gpu_fused_rmsnorm_debug.go

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
new file mode 100644
index 0000000..13ef07f
--- /dev/null
+++ b/.claude/settings.local.json
@@ -0,0 +1,7 @@
+{
+  "permissions": {
+    "additionalDirectories": [
+      "/Users/dndungu/Code/zerfoo/zerfoo"
+    ]
+  }
+}
diff --git a/compute/gpu_fused_rmsnorm.go b/compute/gpu_fused_rmsnorm.go
index 7800fd9..c9bbe04 100644
--- a/compute/gpu_fused_rmsnorm.go
+++ b/compute/gpu_fused_rmsnorm.go
@@ -40,7 +40,6 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float
 		return FusedRMSNorm(input, weight, epsilon)
 	}
 	defer cleanupIn()
-	fusedRMSNormProbe("entry:devIn", e.runtime, e.stream, devIn, total*f32Size)
 
 	// Get weight device pointer. Weight may be Float16Storage (from FP16
 	// weight upload), GPUStorage[float32], or CPU-resident.
@@ -76,14 +75,12 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float
 		}
 	}
 	defer cleanupWeight()
-	fusedRMSNormProbe("after:weightToGPU", e.runtime, e.stream, devWeight, weight.Size()*f32Size)
 
 	outByteSize := total * f32Size
 	devOut, err := e.pool.Alloc(e.deviceID, outByteSize)
 	if err != nil {
 		return FusedRMSNorm(input, weight, epsilon)
 	}
-	fusedRMSNormProbe("after:allocDevOut", e.runtime, e.stream, devOut, outByteSize)
 
 	scalesByteSize := rows * f32Size
 	devScales, err := e.pool.Alloc(e.deviceID, scalesByteSize)
@@ -91,14 +88,12 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float
 		e.pool.Free(e.deviceID, devOut, outByteSize)
 		return FusedRMSNorm(input, weight, epsilon)
 	}
-	fusedRMSNormProbe("after:allocDevScales", e.runtime, e.stream, devScales, scalesByteSize)
 
 	if err := e.kernels.RMSNorm(devIn, devWeight, devOut, devScales, epsilon, rows, D, e.stream); err != nil {
 		e.pool.Free(e.deviceID, devOut, outByteSize)
 		e.pool.Free(e.deviceID, devScales, scalesByteSize)
 		return nil, nil, err
 	}
-	fusedRMSNormProbe("after:kernelRMSNorm", e.runtime, e.stream, devOut, outByteSize)
 
 	outTensor, err := makeGPUResult[float32](f32Engine, shape, devOut, total)
 	if err != nil {
diff --git a/compute/gpu_fused_rmsnorm_debug.go b/compute/gpu_fused_rmsnorm_debug.go
deleted file mode 100644
index 15ebadc..0000000
--- a/compute/gpu_fused_rmsnorm_debug.go
+++ /dev/null
@@ -1,35 +0,0 @@
-package compute
-
-import (
-	"fmt"
-	"os"
-	"unsafe"
-
-	"github.com/zerfoo/ztensor/internal/gpuapi"
-)
-
-// fusedRMSNormDebug is gated on ZERFOO_GQA_DEBUG=1 (same env var used by the
-// zerfoo-side GQA / PLE instrumentation) so we can bisect the corrupting
-// sub-step inside FusedRMSNormGPU for E98.T98.2.2 without spamming prod logs.
-var fusedRMSNormDebug = os.Getenv("ZERFOO_GQA_DEBUG") == "1"
-
-// fusedRMSNormProbe force-syncs the stream and issues a 1-byte D2H cudaMemcpy
-// from ptr. If CUDA is in a sticky error state, either call surfaces the error
-// immediately, which lets us pin the offending sub-step inside
-// FusedRMSNormGPU. No-op when the debug env gate is off.
-func fusedRMSNormProbe(tag string, runtime gpuapi.Runtime, stream gpuapi.Stream, ptr unsafe.Pointer, byteLen int) {
-	if !fusedRMSNormDebug {
-		return
-	}
-	var syncErr error
-	if stream != nil {
-		syncErr = stream.Synchronize()
-	}
-	var memcpyErr error
-	if ptr != nil && byteLen > 0 && runtime != nil {
-		var probe [1]byte
-		memcpyErr = runtime.Memcpy(unsafe.Pointer(&probe[0]), ptr, 1, gpuapi.MemcpyDeviceToHost)
-	}
-	fmt.Fprintf(os.Stderr, "[RMS_DBG] %s gpuPtr=%p bytes=%d sync=%v memcpy=%v\n",
-		tag, ptr, byteLen, syncErr, memcpyErr)
-}

From 12704687d5d62e8ae6bdc6a503af26a7dbae7324 Mon Sep 17 00:00:00 2001
From: David Ndungu <ndungu.sink@gmail.com>
Date: Tue, 14 Apr 2026 19:07:43 -0700
Subject: [PATCH 2/2] chore: ignore .claude/ local settings

---
 .claude/settings.local.json | 7 -------
 .gitignore                  | 1 +
 2 files changed, 1 insertion(+), 7 deletions(-)
 delete mode 100644 .claude/settings.local.json

diff --git a/.claude/settings.local.json b/.claude/settings.local.json
deleted file mode 100644
index 13ef07f..0000000
--- a/.claude/settings.local.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "permissions": {
-    "additionalDirectories": [
-      "/Users/dndungu/Code/zerfoo/zerfoo"
-    ]
-  }
-}
diff --git a/.gitignore b/.gitignore
index f913dd0..9271130 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ go.work.sum
 
 # Object files
 *.o
+.claude/