diff --git a/kernels.cu b/kernels.cu index de4df24..4785dc5 100644 --- a/kernels.cu +++ b/kernels.cu @@ -27,48 +27,33 @@ __global__ void simpleCopyKernel(unsigned long long loopCount, uint4 *dst, uint4 } } +constexpr int UNROLL_FACTOR = 12; + __global__ void stridingMemcpyKernel(unsigned int totalThreadCount, unsigned long long loopCount, uint4* dst, uint4* src, size_t chunkSizeInElement) { unsigned long long from = blockDim.x * blockIdx.x + threadIdx.x; - unsigned long long bigChunkSizeInElement = chunkSizeInElement / 12; + unsigned long long bigChunkSizeInElement = chunkSizeInElement / UNROLL_FACTOR; dst += from; src += from; - uint4* dstBigEnd = dst + (bigChunkSizeInElement * 12) * totalThreadCount; - uint4* dstEnd = dst + chunkSizeInElement * totalThreadCount; + unsigned long long processedElements = bigChunkSizeInElement * UNROLL_FACTOR; for (unsigned int i = 0; i < loopCount; i++) { - uint4* cdst = dst; - uint4* csrc = src; - - while (cdst < dstBigEnd) { - uint4 pipe_0 = *csrc; csrc += totalThreadCount; - uint4 pipe_1 = *csrc; csrc += totalThreadCount; - uint4 pipe_2 = *csrc; csrc += totalThreadCount; - uint4 pipe_3 = *csrc; csrc += totalThreadCount; - uint4 pipe_4 = *csrc; csrc += totalThreadCount; - uint4 pipe_5 = *csrc; csrc += totalThreadCount; - uint4 pipe_6 = *csrc; csrc += totalThreadCount; - uint4 pipe_7 = *csrc; csrc += totalThreadCount; - uint4 pipe_8 = *csrc; csrc += totalThreadCount; - uint4 pipe_9 = *csrc; csrc += totalThreadCount; - uint4 pipe_10 = *csrc; csrc += totalThreadCount; - uint4 pipe_11 = *csrc; csrc += totalThreadCount; - - *cdst = pipe_0; cdst += totalThreadCount; - *cdst = pipe_1; cdst += totalThreadCount; - *cdst = pipe_2; cdst += totalThreadCount; - *cdst = pipe_3; cdst += totalThreadCount; - *cdst = pipe_4; cdst += totalThreadCount; - *cdst = pipe_5; cdst += totalThreadCount; - *cdst = pipe_6; cdst += totalThreadCount; - *cdst = pipe_7; cdst += totalThreadCount; - *cdst = pipe_8; cdst += totalThreadCount; - *cdst = pipe_9; cdst += totalThreadCount; - *cdst = pipe_10; cdst += totalThreadCount; - *cdst = pipe_11; cdst += totalThreadCount; + for (unsigned long long j = 0; j < bigChunkSizeInElement; j++) { + unsigned long long offset = j * UNROLL_FACTOR * totalThreadCount; + + uint4 pipe[UNROLL_FACTOR]; + #pragma unroll + for (int k = 0; k < UNROLL_FACTOR; ++k) { + pipe[k] = src[offset + k * totalThreadCount]; + } + #pragma unroll + for (int k = 0; k < UNROLL_FACTOR; ++k) { + dst[offset + k * totalThreadCount] = {pipe[k].x, pipe[k].y, pipe[k].z, pipe[k].w}; + } } - while (cdst < dstEnd) { - *cdst = *csrc; cdst += totalThreadCount; csrc += totalThreadCount; + #pragma unroll + for (unsigned long long j = processedElements; j < chunkSizeInElement; j++) { + dst[j * totalThreadCount] = src[j * totalThreadCount]; } } }