From 5c1d9ebfc5651b81774ac855a6443d3c778479c4 Mon Sep 17 00:00:00 2001
From: Artem Belevich <tra@google.com>
Date: Mon, 13 Apr 2026 15:53:37 -0700
Subject: [PATCH] Refactor stridingMemcpyKernel to be unroll friendly

While loop over incrementing pointer is harder for compiler to analyze.
Switching to directly indexed pointer accesses gives compiler more
flexibility over unrolling the loop.

Also, work around clang-related issue which results in slow code when
aggregates are swapped via a temporary variable. Copying uint4
element-wise avoids the problem, and works fine with NVCC.
---
 kernels.cu | 53 +++++++++++++++++++----------------------------------
 1 file changed, 19 insertions(+), 34 deletions(-)

diff --git a/kernels.cu b/kernels.cu
index de4df24..4785dc5 100644
--- a/kernels.cu
+++ b/kernels.cu
@@ -27,48 +27,33 @@ __global__ void simpleCopyKernel(unsigned long long loopCount, uint4 *dst, uint4
     }
 }
 
+constexpr int UNROLL_FACTOR = 12;
+
 __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, uint4* dst, uint4* src, size_t chunkSizeInElement) {
     unsigned long long from = blockDim.x * blockIdx.x + threadIdx.x;
-    unsigned long long bigChunkSizeInElement = chunkSizeInElement / 12;
+    unsigned long long bigChunkSizeInElement = chunkSizeInElement / UNROLL_FACTOR;
     dst += from;
     src += from;
-    uint4* dstBigEnd = dst + (bigChunkSizeInElement * 12) * totalThreadCount;
-    uint4* dstEnd = dst + chunkSizeInElement * totalThreadCount;
+    unsigned long long processedElements = bigChunkSizeInElement * UNROLL_FACTOR;
 
     for (unsigned int i = 0; i < loopCount; i++) {
-        uint4* cdst = dst;
-        uint4* csrc = src;
-
-        while (cdst < dstBigEnd) {
-            uint4 pipe_0 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_1 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_2 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_3 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_4 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_5 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_6 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_7 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_8 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_9 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_10 = *csrc; csrc += totalThreadCount;
-            uint4 pipe_11 = *csrc; csrc += totalThreadCount;
-
-            *cdst = pipe_0; cdst += totalThreadCount;
-            *cdst = pipe_1; cdst += totalThreadCount;
-            *cdst = pipe_2; cdst += totalThreadCount;
-            *cdst = pipe_3; cdst += totalThreadCount;
-            *cdst = pipe_4; cdst += totalThreadCount;
-            *cdst = pipe_5; cdst += totalThreadCount;
-            *cdst = pipe_6; cdst += totalThreadCount;
-            *cdst = pipe_7; cdst += totalThreadCount;
-            *cdst = pipe_8; cdst += totalThreadCount;
-            *cdst = pipe_9; cdst += totalThreadCount;
-            *cdst = pipe_10; cdst += totalThreadCount;
-            *cdst = pipe_11; cdst += totalThreadCount;
+        for (unsigned long long j = 0; j < bigChunkSizeInElement; j++) {
+            unsigned long long offset = j * UNROLL_FACTOR * totalThreadCount;
+
+            uint4 pipe[UNROLL_FACTOR];
+            #pragma unroll
+            for (int k = 0; k < UNROLL_FACTOR; ++k) {
+                pipe[k] = src[offset + k * totalThreadCount];
+            }
+            #pragma unroll
+            for (int k = 0; k < UNROLL_FACTOR; ++k) {
+                dst[offset + k * totalThreadCount] = {pipe[k].x, pipe[k].y, pipe[k].z, pipe[k].w};
+            }
         }
 
-        while (cdst < dstEnd) {
-            *cdst = *csrc; cdst += totalThreadCount; csrc += totalThreadCount;
+        #pragma unroll
+        for (unsigned long long j = processedElements; j < chunkSizeInElement; j++) {
+            dst[j * totalThreadCount] = src[j * totalThreadCount];
         }
     }
 }