From 996a4e2f425e67930f612c038e57db875329eb25 Mon Sep 17 00:00:00 2001 From: David Ndungu Date: Wed, 15 Apr 2026 19:34:36 -0700 Subject: [PATCH] feat(graph): T99.1.2 mark Gemma4PLECombinedProducer non-capturable Gemma4PLECombinedProducer performs a CPU-side gather over the shared PLE embedding table and then calls MulScalar on the freshly-allocated CPUStorage tensor. Inside a CUDA graph capture stream this triggers a synchronous H2D cudaMemcpy that CUDA rejects with "operation would make the legacy stream depend on a capturing blocking stream". Add the op to nonCapturableOps so the producer runs in pre-capture on every forward, outside the capturing stream. The producer runs once per forward pass before the transformer loop, so this placement keeps the layer-body capture region intact. Companion change in zerfoo/inference/gemma4_edge_ple_nodes.go (E99 T99.1.2) pre-slices the producer's outputs into stable GPU buffers so pleSliceNode stays fully capturable. Decision recorded in zerfoo/docs/adr/088-gemma4-ple-cuda-graph-capture.md. --- graph/cuda_graph.go | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/graph/cuda_graph.go b/graph/cuda_graph.go index 009aedf..f813dbb 100644 --- a/graph/cuda_graph.go +++ b/graph/cuda_graph.go @@ -44,14 +44,22 @@ var debugGraphCapture = os.Getenv("ZERFOO_DEBUG_GPU") == "1" // (offset_memcpy kernel) and GQA uses GPU RoPE selection (rope_select kernel), // all position-dependent state is read from GPU memory at replay time, making // GQA fully capturable. +// +// Gemma4PLECombinedProducer: performs a CPU-side gather over the shared PLE +// embedding table (token ids -> per-layer rows), then calls MulScalar on the +// freshly-allocated CPUStorage tensor. Running this inside a capture stream +// triggers a synchronous H2D cudaMemcpy that CUDA rejects. The producer runs +// once per forward pass before the transformer loop, so placing it in +// pre-capture keeps the layer-body capture region intact. See ADR-088. var nonCapturableOps = map[string]bool{ - "EmbeddingLookup": true, - "Gather": true, - "AutoAttentionMask": true, - "AutoPositionIds": true, - "Slice": true, - "ConstantOfShape": true, - "Shape": true, + "EmbeddingLookup": true, + "Gather": true, + "AutoAttentionMask": true, + "AutoPositionIds": true, + "Slice": true, + "ConstantOfShape": true, + "Shape": true, + "Gemma4PLECombinedProducer": true, } // isNonCapturable returns true if the instruction at index i in the plan