diff --git a/.gitignore b/.gitignore
index f913dd0..9271130 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,4 @@ go.work.sum
 
 # Object files
 *.o
+.claude/
diff --git a/compute/gpu_fused_rmsnorm.go b/compute/gpu_fused_rmsnorm.go
index 7800fd9..c9bbe04 100644
--- a/compute/gpu_fused_rmsnorm.go
+++ b/compute/gpu_fused_rmsnorm.go
@@ -40,7 +40,6 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float
 		return FusedRMSNorm(input, weight, epsilon)
 	}
 	defer cleanupIn()
-	fusedRMSNormProbe("entry:devIn", e.runtime, e.stream, devIn, total*f32Size)
 
 	// Get weight device pointer. Weight may be Float16Storage (from FP16
 	// weight upload), GPUStorage[float32], or CPU-resident.
@@ -76,14 +75,12 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float
 		}
 	}
 	defer cleanupWeight()
-	fusedRMSNormProbe("after:weightToGPU", e.runtime, e.stream, devWeight, weight.Size()*f32Size)
 
 	outByteSize := total * f32Size
 	devOut, err := e.pool.Alloc(e.deviceID, outByteSize)
 	if err != nil {
 		return FusedRMSNorm(input, weight, epsilon)
 	}
-	fusedRMSNormProbe("after:allocDevOut", e.runtime, e.stream, devOut, outByteSize)
 
 	scalesByteSize := rows * f32Size
 	devScales, err := e.pool.Alloc(e.deviceID, scalesByteSize)
@@ -91,14 +88,12 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float
 		e.pool.Free(e.deviceID, devOut, outByteSize)
 		return FusedRMSNorm(input, weight, epsilon)
 	}
-	fusedRMSNormProbe("after:allocDevScales", e.runtime, e.stream, devScales, scalesByteSize)
 
 	if err := e.kernels.RMSNorm(devIn, devWeight, devOut, devScales, epsilon, rows, D, e.stream); err != nil {
 		e.pool.Free(e.deviceID, devOut, outByteSize)
 		e.pool.Free(e.deviceID, devScales, scalesByteSize)
 		return nil, nil, err
 	}
-	fusedRMSNormProbe("after:kernelRMSNorm", e.runtime, e.stream, devOut, outByteSize)
 
 	outTensor, err := makeGPUResult[float32](f32Engine, shape, devOut, total)
 	if err != nil {
diff --git a/compute/gpu_fused_rmsnorm_debug.go b/compute/gpu_fused_rmsnorm_debug.go
deleted file mode 100644
index 15ebadc..0000000
--- a/compute/gpu_fused_rmsnorm_debug.go
+++ /dev/null
@@ -1,35 +0,0 @@
-package compute
-
-import (
-	"fmt"
-	"os"
-	"unsafe"
-
-	"github.com/zerfoo/ztensor/internal/gpuapi"
-)
-
-// fusedRMSNormDebug is gated on ZERFOO_GQA_DEBUG=1 (same env var used by the
-// zerfoo-side GQA / PLE instrumentation) so we can bisect the corrupting
-// sub-step inside FusedRMSNormGPU for E98.T98.2.2 without spamming prod logs.
-var fusedRMSNormDebug = os.Getenv("ZERFOO_GQA_DEBUG") == "1"
-
-// fusedRMSNormProbe force-syncs the stream and issues a 1-byte D2H cudaMemcpy
-// from ptr. If CUDA is in a sticky error state, either call surfaces the error
-// immediately, which lets us pin the offending sub-step inside
-// FusedRMSNormGPU. No-op when the debug env gate is off.
-func fusedRMSNormProbe(tag string, runtime gpuapi.Runtime, stream gpuapi.Stream, ptr unsafe.Pointer, byteLen int) {
-	if !fusedRMSNormDebug {
-		return
-	}
-	var syncErr error
-	if stream != nil {
-		syncErr = stream.Synchronize()
-	}
-	var memcpyErr error
-	if ptr != nil && byteLen > 0 && runtime != nil {
-		var probe [1]byte
-		memcpyErr = runtime.Memcpy(unsafe.Pointer(&probe[0]), ptr, 1, gpuapi.MemcpyDeviceToHost)
-	}
-	fmt.Fprintf(os.Stderr, "[RMS_DBG] %s gpuPtr=%p bytes=%d sync=%v memcpy=%v\n",
-		tag, ptr, byteLen, syncErr, memcpyErr)
-}