diff --git a/.gitignore b/.gitignore index f913dd0..9271130 100644 --- a/.gitignore +++ b/.gitignore @@ -26,3 +26,4 @@ go.work.sum # Object files *.o +.claude/ diff --git a/compute/gpu_fused_rmsnorm.go b/compute/gpu_fused_rmsnorm.go index 7800fd9..c9bbe04 100644 --- a/compute/gpu_fused_rmsnorm.go +++ b/compute/gpu_fused_rmsnorm.go @@ -40,7 +40,6 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float return FusedRMSNorm(input, weight, epsilon) } defer cleanupIn() - fusedRMSNormProbe("entry:devIn", e.runtime, e.stream, devIn, total*f32Size) // Get weight device pointer. Weight may be Float16Storage (from FP16 // weight upload), GPUStorage[float32], or CPU-resident. @@ -76,14 +75,12 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float } } defer cleanupWeight() - fusedRMSNormProbe("after:weightToGPU", e.runtime, e.stream, devWeight, weight.Size()*f32Size) outByteSize := total * f32Size devOut, err := e.pool.Alloc(e.deviceID, outByteSize) if err != nil { return FusedRMSNorm(input, weight, epsilon) } - fusedRMSNormProbe("after:allocDevOut", e.runtime, e.stream, devOut, outByteSize) scalesByteSize := rows * f32Size devScales, err := e.pool.Alloc(e.deviceID, scalesByteSize) @@ -91,14 +88,12 @@ func (e *GPUEngine[T]) FusedRMSNormGPU(input, weight *tensor.TensorNumeric[float e.pool.Free(e.deviceID, devOut, outByteSize) return FusedRMSNorm(input, weight, epsilon) } - fusedRMSNormProbe("after:allocDevScales", e.runtime, e.stream, devScales, scalesByteSize) if err := e.kernels.RMSNorm(devIn, devWeight, devOut, devScales, epsilon, rows, D, e.stream); err != nil { e.pool.Free(e.deviceID, devOut, outByteSize) e.pool.Free(e.deviceID, devScales, scalesByteSize) return nil, nil, err } - fusedRMSNormProbe("after:kernelRMSNorm", e.runtime, e.stream, devOut, outByteSize) outTensor, err := makeGPUResult[float32](f32Engine, shape, devOut, total) if err != nil { diff --git a/compute/gpu_fused_rmsnorm_debug.go b/compute/gpu_fused_rmsnorm_debug.go deleted file mode 100644 index 15ebadc..0000000 --- a/compute/gpu_fused_rmsnorm_debug.go +++ /dev/null @@ -1,35 +0,0 @@ -package compute - -import ( - "fmt" - "os" - "unsafe" - - "github.com/zerfoo/ztensor/internal/gpuapi" -) - -// fusedRMSNormDebug is gated on ZERFOO_GQA_DEBUG=1 (same env var used by the -// zerfoo-side GQA / PLE instrumentation) so we can bisect the corrupting -// sub-step inside FusedRMSNormGPU for E98.T98.2.2 without spamming prod logs. -var fusedRMSNormDebug = os.Getenv("ZERFOO_GQA_DEBUG") == "1" - -// fusedRMSNormProbe force-syncs the stream and issues a 1-byte D2H cudaMemcpy -// from ptr. If CUDA is in a sticky error state, either call surfaces the error -// immediately, which lets us pin the offending sub-step inside -// FusedRMSNormGPU. No-op when the debug env gate is off. -func fusedRMSNormProbe(tag string, runtime gpuapi.Runtime, stream gpuapi.Stream, ptr unsafe.Pointer, byteLen int) { - if !fusedRMSNormDebug { - return - } - var syncErr error - if stream != nil { - syncErr = stream.Synchronize() - } - var memcpyErr error - if ptr != nil && byteLen > 0 && runtime != nil { - var probe [1]byte - memcpyErr = runtime.Memcpy(unsafe.Pointer(&probe[0]), ptr, 1, gpuapi.MemcpyDeviceToHost) - } - fmt.Fprintf(os.Stderr, "[RMS_DBG] %s gpuPtr=%p bytes=%d sync=%v memcpy=%v\n", - tag, ptr, byteLen, syncErr, memcpyErr) -}