From c150e0b7397a6ae3f9dec2bcbf0e91b5829512d2 Mon Sep 17 00:00:00 2001 From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com> Date: Fri, 24 Apr 2026 14:00:21 +0100 Subject: [PATCH 1/3] cpu_config.cpp: replaced omp_get_num_threads with omp_get_max_threads. Yes, it's confusing. Yes, the OpenMP ARB know. --- quest/src/cpu/cpu_config.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp index c11ec224..ad8e303a 100644 --- a/quest/src/cpu/cpu_config.cpp +++ b/quest/src/cpu/cpu_config.cpp @@ -79,9 +79,7 @@ int cpu_getAvailableNumThreads() { #if COMPILE_OPENMP int n = -1; - #pragma omp parallel shared(n) - #pragma omp single - n = omp_get_num_threads(); + n = omp_get_max_threads(); return n; #else From 9b8ddd1bb3d085d4a0d92b7f663b42fee138481e Mon Sep 17 00:00:00 2001 From: Oliver Thomson Brown <8394906+otbrown@users.noreply.github.com> Date: Fri, 24 Apr 2026 18:10:26 +0100 Subject: [PATCH 2/3] We do these things not because they are easy, but because we thought they would be easy. --- quest/include/environment.h | 8 ++ quest/src/api/environment.cpp | 11 +++ quest/src/gpu/gpu_config.cpp | 19 +++++ quest/src/gpu/gpu_config.hpp | 8 +- quest/src/gpu/gpu_kernels.cuh | 6 +- quest/src/gpu/gpu_subroutines.cpp | 121 ++++++++++++++++++++---------- 6 files changed, 124 insertions(+), 49 deletions(-) diff --git a/quest/include/environment.h b/quest/include/environment.h index 04f24bfe..a6724828 100644 --- a/quest/include/environment.h +++ b/quest/include/environment.h @@ -83,6 +83,14 @@ int isQuESTEnvInit(); QuESTEnv getQuESTEnv(); +/** @notyetdoced + * GPU thread per block control + * This is somehow probably the best pre-existing place for this. It only really applies to GPU, because for + * OpenMP the user can just export OMP_NUM_THREADS or call omp_set_num_threads. 
+ */ +int getQuESTGpuThreadsPerBlock(); +void setQuESTGpuThreadsPerBlock(const int NEW_TPB); + // end de-mangler #ifdef __cplusplus diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp index 54149189..1f36ee64 100644 --- a/quest/src/api/environment.cpp +++ b/quest/src/api/environment.cpp @@ -509,5 +509,16 @@ void getEnvironmentString(char str[200]) { } +int getQuESTGpuThreadsPerBlock() { + QuESTEnv env = getQuESTEnv(); + return env.isGpuAccelerated? gpu_getNumThreadsPerBlock() : 0; +} + +void setQuESTGpuThreadsPerBlock(const int NEW_TPB) { + // just rely on the internal function to throw an error if there's no GPU support compiled + gpu_setNumThreadsPerBlock(NEW_TPB); + return; +} + // end de-mangler } diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp index c7db834b..78ef1a41 100644 --- a/quest/src/gpu/gpu_config.cpp +++ b/quest/src/gpu/gpu_config.cpp @@ -41,6 +41,7 @@ #include "quest/src/gpu/cuda_to_hip.hpp" #endif +int numThreadsPerBlock = 128; /* @@ -330,6 +331,24 @@ qindex gpu_getMaxNumConcurrentThreads() { * ENVIRONMENT MANAGEMENT */ +int gpu_getNumThreadsPerBlock() { +#if COMPILE_CUDA + return numThreadsPerBlock; +#else + error_gpuQueriedButGpuNotCompiled(); + return -1; +#endif +} + +void gpu_setNumThreadsPerBlock(const int NEW_TPB) { +#if COMPILE_CUDA + numThreadsPerBlock = NEW_TPB; +#else + error_gpuQueriedButGpuNotCompiled(); +#endif + return; +} + std::array getBoundGpuUuid() { #if COMPILE_CUDA diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp index 1b3be629..866475cc 100644 --- a/quest/src/gpu/gpu_config.hpp +++ b/quest/src/gpu/gpu_config.hpp @@ -19,7 +19,6 @@ #include "quest/include/channels.h" - /* * CUDA ERROR HANDLING */ @@ -65,6 +64,10 @@ qindex gpu_getMaxNumConcurrentThreads(); * ENVIRONMENT MANAGEMENT */ +int gpu_getNumThreadsPerBlock(); + +void gpu_setNumThreadsPerBlock(const int NEW_TPB); + void gpu_bindLocalGPUsToNodes(); bool gpu_areAnyNodesBoundToSameGpu(); @@ -76,7 +79,6 @@ 
void gpu_initCuQuantum(); void gpu_finalizeCuQuantum(); - /* * MEMORY MANAGEMENT */ @@ -122,4 +124,4 @@ size_t gpu_getCacheMemoryInBytes(); -#endif // GPU_CONFIG_HPP \ No newline at end of file +#endif // GPU_CONFIG_HPP diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 4f2a737e..7459235d 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -46,16 +46,12 @@ * THREAD MANAGEMENT */ - -const int NUM_THREADS_PER_BLOCK = 128; - - __forceinline__ __device__ qindex getThreadInd() { return blockIdx.x*blockDim.x + threadIdx.x; } -__host__ qindex getNumBlocks(qindex numThreads) { +__host__ qindex getNumBlocks(qindex numThreads, const int NUM_THREADS_PER_BLOCK) { /// @todo /// improve this with cudaOccupancyMaxPotentialBlockSize(), diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 5e18048f..56b855c4 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -66,7 +66,6 @@ using std::vector; - /* * GETTERS */ @@ -141,7 +140,8 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector qubits, vector>> ( @@ -208,7 +209,8 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); devints sortedQubits = util_getSorted(ctrls, {targ2, targ1}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); @@ -232,7 +234,8 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = 
getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex recvInd = getBufferRecvInd(); devints sortedCtrls = util_getSorted(ctrls); @@ -257,7 +260,8 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex recvInd = getBufferRecvInd(); devints sortedQubits = util_getSorted(ctrls, {targ}); @@ -299,7 +303,8 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); devints sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); @@ -326,7 +331,8 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex recvInd = getBufferRecvInd(); devints sortedCtrls = util_getSorted(ctrls); @@ -368,7 +374,8 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); devints sortedQubits = util_getSorted(ctrls, {targ1,targ2}); qindex 
qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0}); @@ -463,7 +470,8 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve /// global memory) and greatly sabotage performance on some GPUs. qindex numThreads = numBatches; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); kernel_statevec_anyCtrlFewTargDenseMatr @@ -486,6 +494,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // where we assign one-block per multiprocessor because we are anyway memory- // bandwidth bound (so we don't expect many interweaved blocks per MP). qindex numThreads = gpu_getMaxNumConcurrentThreads(); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); // use strictly 2^# threads to maintain precondition of all kernels if (!isPowerOf2(numThreads)) @@ -497,7 +506,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // evenly distribute the batches between threads, and the threads unevenly between blocks qindex numBatchesPerThread = numBatches / numThreads; // divides evenly - qindex numBlocks = getNumBlocks(numThreads); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); // expand the cache if necessary qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK; @@ -566,7 +575,8 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec /// efficient (because of improved parallelisation granularity) qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -634,7 +644,8 @@ void 
gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec /// efficient (because of improved parallelisation granularity) qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -702,7 +713,8 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec /// efficient (because of improved parallelisation granularity) qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); devints deviceTargs = targs; devints deviceCtrls = util_getSorted(ctrls); @@ -759,7 +771,8 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); kernel_densmatr_allTargDiagMatr_sub @@ -821,7 +834,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct // faster than when giving threads many pair-amps to modify, due to memory movements qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, getPtr(deviceQubits), ctrls.size(), qubitStateMask, @@ -843,7 +857,8 @@ void 
gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex recvInd = getBufferRecvInd(); qcomp powI = util_getPowerOfI(y.size()); @@ -884,7 +899,8 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector c #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); devints sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); @@ -917,7 +933,8 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); // extract amp ptrs from qureg list vector ptrs; @@ -957,7 +974,8 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); kernel_densmatr_mixQureg_subB <<>> ( outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(inQureg.gpuAmps), @@ -975,7 +993,8 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - 
qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); kernel_densmatr_mixQureg_subC <<>> ( outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(outQureg.gpuCommBuffer), @@ -1007,7 +1026,8 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob) #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); auto fac = util_getOneQubitDephasingFactor(prob); int braQubit = util_getBraQubit(ketQubit, qureg); @@ -1033,7 +1053,8 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob) #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); auto fac = util_getOneQubitDephasingFactor(prob); int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); @@ -1078,7 +1099,8 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); auto term = util_getTwoQubitDephasingTerm(prob); int braQubitA = util_getBraQubit(ketQubitA, qureg); @@ -1106,7 +1128,8 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = 
getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); int braQubit = util_getBraQubit(ketQubit, qureg); auto factors = util_getOneQubitDepolarisingFactors(prob); @@ -1126,7 +1149,8 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex recvInd = getBufferRecvInd(); int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); @@ -1154,7 +1178,8 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); int braQb1 = util_getBraQubit(ketQb1, qureg); int braQb2 = util_getBraQubit(ketQb2, qureg); @@ -1176,7 +1201,8 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 16; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); int braQb1 = util_getBraQubit(ketQb1, qureg); int braQb2 = util_getBraQubit(ketQb2, qureg); @@ -1201,7 +1227,8 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); int braQb1 = util_getBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, 
qureg); @@ -1223,7 +1250,8 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 8; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex offset = getBufferRecvInd(); int braQb1 = util_getBraQubit(ketQb1, qureg); @@ -1246,7 +1274,8 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); @@ -1271,7 +1300,8 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex offset = getBufferRecvInd(); int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg); @@ -1300,7 +1330,8 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); int braQubit = util_getBraQubit(ketQubit, qureg); auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); @@ -1321,7 +1352,8 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, #if 
COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex recvInd = getBufferRecvInd(); int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); @@ -1349,7 +1381,8 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); int braQubit = util_getBraQubit(ketQubit, qureg); auto factors = util_getOneQubitDampingFactors(prob); @@ -1370,7 +1403,8 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); auto c2 = util_getOneQubitDampingFactors(prob).c2; @@ -1389,7 +1423,8 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); auto braBit = util_getRankBitOfBraQubit(ketQubit, qureg); auto c1 = util_getOneQubitDampingFactors(prob).c1; @@ -1409,7 +1444,8 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = 
gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex recvInd = getBufferRecvInd(); kernel_densmatr_oneQubitDamping_subD <<>> ( @@ -1437,7 +1473,8 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); devints devTargs = targs; devints devPairTargs = pairTargs; @@ -1557,7 +1594,8 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu #if COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode; - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); // allocate exponentially-big temporary memory (error if failed) devints devQubits = qubits; @@ -1591,7 +1629,8 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu // we decouple numColsPerNode and numThreads for clarity // (and in case parallelisation granularity ever changes); qindex numThreads = powerOf2(qureg.logNumColsPerNode); - qindex numBlocks = getNumBlocks(numThreads); + const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); qindex numAmpsPerCol = powerOf2(qureg.numQubits); From 680fdda0e53b9dea14e402a9cbf8cb4337ef181a Mon Sep 17 00:00:00 2001 From: Oliver Thomson Brown Date: Mon, 4 May 2026 14:06:08 +0100 Subject: [PATCH 3/3] updated var names to match QuEST style --- quest/src/gpu/gpu_config.cpp | 4 +- quest/src/gpu/gpu_config.hpp | 2 +- quest/src/gpu/gpu_kernels.cuh | 4 +- quest/src/gpu/gpu_subroutines.cpp | 244 +++++++++++++++--------------- 4 files 
changed, 127 insertions(+), 127 deletions(-) diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp index 78ef1a41..588779c4 100644 --- a/quest/src/gpu/gpu_config.cpp +++ b/quest/src/gpu/gpu_config.cpp @@ -340,9 +340,9 @@ int gpu_getNumThreadsPerBlock() { #endif } -void gpu_setNumThreadsPerBlock(const int NEW_TPB) { +void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock) { #if COMPILE_CUDA - numThreadsPerBlock = NEW_TPB; + numThreadsPerBlock = newThreadsPerBlock; #else error_gpuQueriedButGpuNotCompiled(); #endif diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp index 866475cc..0787e127 100644 --- a/quest/src/gpu/gpu_config.hpp +++ b/quest/src/gpu/gpu_config.hpp @@ -66,7 +66,7 @@ qindex gpu_getMaxNumConcurrentThreads(); int gpu_getNumThreadsPerBlock(); -void gpu_setNumThreadsPerBlock(const int NEW_TPB); +void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock); void gpu_bindLocalGPUsToNodes(); diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 7459235d..540a409f 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -51,14 +51,14 @@ __forceinline__ __device__ qindex getThreadInd() { } -__host__ qindex getNumBlocks(qindex numThreads, const int NUM_THREADS_PER_BLOCK) { +__host__ qindex getNumBlocks(qindex numThreads, const int numThreadsPerBlock) { /// @todo /// improve this with cudaOccupancyMaxPotentialBlockSize(), /// making it function specific // CUDA ceil - return ceil(numThreads / static_cast(NUM_THREADS_PER_BLOCK)); + return ceil(numThreads / static_cast(numThreadsPerBlock)); } diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp index 56b855c4..a75c44cc 100644 --- a/quest/src/gpu/gpu_subroutines.cpp +++ b/quest/src/gpu/gpu_subroutines.cpp @@ -140,14 +140,14 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector qubits, vector <<>> ( + kernel_statevec_packAmpsIntoBuffer <<>> ( toCuQcomps(qureg.gpuAmps), 
&toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, getPtr(sortedQubits), qubits.size(), qubitStateMask ); @@ -169,11 +169,11 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 8; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex sendInd = getSubBufferSendInd(qureg); - kernel_statevec_packPairSummedAmpsIntoBuffer <<>> ( + kernel_statevec_packPairSummedAmpsIntoBuffer <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd], numThreads, qubit1, qubit2, qubit3, bit2 ); @@ -209,13 +209,13 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size()); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints sortedQubits = util_getSorted(ctrls, {targ2, targ1}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1}); - kernel_statevec_anyCtrlSwap_subA <<>> ( + kernel_statevec_anyCtrlSwap_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2 ); @@ -234,14 +234,14 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = 
gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); devints sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - kernel_statevec_anyCtrlSwap_subB <<>> ( + kernel_statevec_anyCtrlSwap_subB <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask ); @@ -260,14 +260,14 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size()); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); devints sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState}); - kernel_statevec_anyCtrlSwap_subC <<>> ( + kernel_statevec_anyCtrlSwap_subC <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask ); @@ -303,15 +303,15 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints sortedQubits = util_getSorted(ctrls, {targ}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0}); auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr); - kernel_statevec_anyCtrlOneTargDenseMatr_subA 
<<>> ( + kernel_statevec_anyCtrlOneTargDenseMatr_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ, m00, m01, m10, m11 @@ -331,14 +331,14 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); devints sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( + kernel_statevec_anyCtrlOneTargDenseMatr_subB <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, toCuQcomp(fac0), toCuQcomp(fac1) @@ -374,8 +374,8 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints sortedQubits = util_getSorted(ctrls, {targ1,targ2}); qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0}); @@ -383,7 +383,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...) 
auto m = unpackMatrixToCuQcomps(matr); - kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( + kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2, m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7], @@ -460,7 +460,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve if constexpr (NumTargs != -1) { // when NumTargs <= 5, each thread has a private array stored in the registers, - // enabling rapid IO. Given NUM_THREADS_PER_BLOCK = 128, the maximum size of + // enabling rapid IO. Given numThreadsPerBlock = 128, the maximum size of // this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared // memory capacity, but does NOT exceed maximum register capacity. @@ -470,12 +470,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve /// global memory) and greatly sabotage performance on some GPUs. qindex numThreads = numBatches; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); kernel_statevec_anyCtrlFewTargDenseMatr - <<>> ( + <<>> ( ampsPtr, numThreads, qubitsPtr, nCtrls, qubitStateMask, targsPtr, matrPtr @@ -494,7 +494,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // where we assign one-block per multiprocessor because we are anyway memory- // bandwidth bound (so we don't expect many interweaved blocks per MP). 
qindex numThreads = gpu_getMaxNumConcurrentThreads(); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); // use strictly 2^# threads to maintain precondition of all kernels if (!isPowerOf2(numThreads)) @@ -506,15 +506,15 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve // evenly distribute the batches between threads, and the threads unevenly between blocks qindex numBatchesPerThread = numBatches / numThreads; // divides evenly - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); // expand the cache if necessary - qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK; + qindex numKernelInvocations = numBlocks * numThreadsPerBlock; qcomp* cache = gpu_getCacheOfSize(powerOf2(targs.size()), numKernelInvocations); kernel_statevec_anyCtrlManyTargDenseMatr - <<>> ( + <<>> ( toCuQcomps(cache), ampsPtr, numThreads, numBatchesPerThread, qubitsPtr, nCtrls, qubitStateMask, @@ -575,14 +575,14 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec /// efficient (because of improved parallelisation granularity) qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = unpackMatrixToCuQcomps(matr); - kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( + kernel_statevec_anyCtrlOneTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1] ); @@ -644,14 +644,14 @@ void 
gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec /// efficient (because of improved parallelisation granularity) qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); auto elems = unpackMatrixToCuQcomps(matr); - kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( + kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ1, targ2, elems[0], elems[1], elems[2], elems[3] @@ -713,14 +713,14 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec /// efficient (because of improved parallelisation granularity) qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints deviceTargs = targs; devints deviceCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); - kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( + kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(), toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent) @@ -771,12 +771,12 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex 
numThreads = qureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); kernel_densmatr_allTargDiagMatr_sub - <<>> ( + <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, toCuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toCuQcomp(exponent) ); @@ -834,9 +834,9 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct // faster than when giving threads many pair-amps to modify, due to memory movements qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); - kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); + kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, getPtr(deviceQubits), ctrls.size(), qubitStateMask, getPtr(deviceTargs), deviceTargs.size(), @@ -857,8 +857,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); qcomp powI = util_getPowerOfI(y.size()); @@ -868,7 +868,7 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct devints sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = 
util_getBitMask(ctrls, ctrlStates); - kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( + kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, maskXY, maskYZ, bufferMaskXY, @@ -899,14 +899,14 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector c #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size()); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints sortedCtrls = util_getSorted(ctrls); qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates); qindex targMask = util_getBitMask(targs); - kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( + kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, targMask, toCuQcomp(fac0), toCuQcomp(fac1) @@ -933,8 +933,8 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); // extract amp ptrs from qureg list vector ptrs; @@ -946,7 +946,7 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs devcuqcompptrs devQuregAmps = ptrs; devcomps devCoeffs = coeffs; - kernel_statevec_setQuregToWeightedSum_sub <<>> ( + kernel_statevec_setQuregToWeightedSum_sub <<>> ( toCuQcomps(outQureg.gpuAmps), numThreads, getPtr(devCoeffs), 
getPtr(devQuregAmps), inQuregs.size() ); @@ -974,10 +974,10 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - kernel_densmatr_mixQureg_subB <<>> ( + kernel_densmatr_mixQureg_subB <<>> ( outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(inQureg.gpuAmps), numThreads, inQureg.numAmps ); @@ -993,10 +993,10 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); - kernel_densmatr_mixQureg_subC <<>> ( + kernel_densmatr_mixQureg_subC <<>> ( outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(outQureg.gpuCommBuffer), numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode ); @@ -1026,13 +1026,13 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob) #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / 4; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto fac = util_getOneQubitDephasingFactor(prob); int braQubit = util_getBraQubit(ketQubit, qureg); - kernel_densmatr_oneQubitDephasing_subA <<>> ( + kernel_densmatr_oneQubitDephasing_subA <<>> 
( toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac ); @@ -1053,13 +1053,13 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob) #elif COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode / 2; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto fac = util_getOneQubitDephasingFactor(prob); int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); - kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> ( + kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> ( toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, fac ); @@ -1099,14 +1099,14 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto term = util_getTwoQubitDephasingTerm(prob); int braQubitA = util_getBraQubit(ketQubitA, qureg); int braQubitB = util_getBraQubit(ketQubitB, qureg); - kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> ( + kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> ( toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols ketQubitA, ketQubitB, braQubitA, braQubitB, term ); @@ -1128,13 +1128,13 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - const int NUM_THREADS_PER_BLOCK = 
gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQubit = util_getBraQubit(ketQubit, qureg); auto factors = util_getOneQubitDepolarisingFactors(prob); - kernel_densmatr_oneQubitDepolarising_subA <<>> ( + kernel_densmatr_oneQubitDepolarising_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3 ); @@ -1149,14 +1149,14 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); auto factors = util_getOneQubitDepolarisingFactors(prob); - kernel_densmatr_oneQubitDepolarising_subB <<>> ( + kernel_densmatr_oneQubitDepolarising_subB <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3 ); @@ -1178,14 +1178,14 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQb1 = util_getBraQubit(ketQb1, qureg); int braQb2 = util_getBraQubit(ketQb2, qureg); auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; - kernel_densmatr_twoQubitDepolarising_subA <<>> ( + kernel_densmatr_twoQubitDepolarising_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, ketQb1, 
ketQb2, braQb1, braQb2, c3 ); @@ -1201,8 +1201,8 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 16; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQb1 = util_getBraQubit(ketQb1, qureg); int braQb2 = util_getBraQubit(ketQb2, qureg); @@ -1211,7 +1211,7 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2, // each kernel invocation sums all 4 amps together, so adjusts c1 qreal altc1 = factors.c1 - factors.c2; - kernel_densmatr_twoQubitDepolarising_subB <<>> ( + kernel_densmatr_twoQubitDepolarising_subB <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2 ); @@ -1227,14 +1227,14 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQb1 = util_getBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3; - kernel_densmatr_twoQubitDepolarising_subC <<>> ( + kernel_densmatr_twoQubitDepolarising_subC <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braBit2, c3 ); @@ -1250,15 +1250,15 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 8; - const int NUM_THREADS_PER_BLOCK = 
gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex offset = getBufferRecvInd(); int braQb1 = util_getBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); auto factors = util_getTwoQubitDepolarisingFactors(prob); - kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> ( + kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, numThreadsPerBlock>>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads, ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2 ); @@ -1274,8 +1274,8 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); @@ -1284,7 +1284,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2, qreal fac0 = 1 + factors.c3; qreal fac1 = factors.c1 - fac0; - kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, NUM_THREADS_PER_BLOCK>>> ( + kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, numThreadsPerBlock>>> ( toCuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braBit1, braBit2, fac0, fac1 ); @@ -1300,15 +1300,15 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = 
getNumBlocks(numThreads, numThreadsPerBlock); qindex offset = getBufferRecvInd(); int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg); int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg); auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2; - kernel_densmatr_twoQubitDepolarising_subF <<>> ( + kernel_densmatr_twoQubitDepolarising_subF <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads, ketQb1, ketQb2, braBit1, braBit2, c2 ); @@ -1330,13 +1330,13 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQubit = util_getBraQubit(ketQubit, qureg); auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); - kernel_densmatr_oneQubitPauliChannel_subA <<>> ( + kernel_densmatr_oneQubitPauliChannel_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1352,14 +1352,14 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI, #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); int braBit = util_getRankBitOfBraQubit(ketQubit, qureg); auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ); - kernel_densmatr_oneQubitPauliChannel_subB <<>> ( + kernel_densmatr_oneQubitPauliChannel_subB <<>> ( toCuQcomps(qureg.gpuAmps), 
&toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4 ); @@ -1381,13 +1381,13 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 4; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); int braQubit = util_getBraQubit(ketQubit, qureg); auto factors = util_getOneQubitDampingFactors(prob); - kernel_densmatr_oneQubitDamping_subA <<>> ( + kernel_densmatr_oneQubitDamping_subA <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, prob, factors.c1, factors.c2 ); @@ -1403,12 +1403,12 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto c2 = util_getOneQubitDampingFactors(prob).c2; - kernel_densmatr_oneQubitDamping_subB <<>> ( + kernel_densmatr_oneQubitDamping_subB <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, qubit, c2 ); @@ -1423,13 +1423,13 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); auto braBit = 
util_getRankBitOfBraQubit(ketQubit, qureg); auto c1 = util_getOneQubitDampingFactors(prob).c1; - kernel_densmatr_oneQubitDamping_subC <<>> ( + kernel_densmatr_oneQubitDamping_subC <<>> ( toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, c1 ); @@ -1444,11 +1444,11 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) { #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = qureg.numAmpsPerNode / 2; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex recvInd = getBufferRecvInd(); - kernel_densmatr_oneQubitDamping_subD <<>> ( + kernel_densmatr_oneQubitDamping_subD <<>> ( toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads, qubit, prob ); @@ -1473,14 +1473,14 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta #if COMPILE_CUDA || COMPILE_CUQUANTUM qindex numThreads = outQureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); devints devTargs = targs; devints devPairTargs = pairTargs; devints devAllTargs = util_getSorted(targs, pairTargs); - kernel_densmatr_partialTrace_sub <<>> ( + kernel_densmatr_partialTrace_sub <<>> ( toCuQcomps(inQureg.gpuAmps), toCuQcomps(outQureg.gpuAmps), numThreads, getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size() ); @@ -1594,14 +1594,14 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu #if COMPILE_CUDA qindex numThreads = qureg.numAmpsPerNode; - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, 
NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); // allocate exponentially-big temporary memory (error if failed) devints devQubits = qubits; devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws - kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> ( + kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<<numBlocks, numThreadsPerBlock>>> ( getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size() ); @@ -1629,8 +1629,8 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu // we decouple numColsPerNode and numThreads for clarity // (and in case parallelisation granularity ever changes); qindex numThreads = powerOf2(qureg.logNumColsPerNode); - const int NUM_THREADS_PER_BLOCK = gpu_getNumThreadsPerBlock(); - qindex numBlocks = getNumBlocks(numThreads, NUM_THREADS_PER_BLOCK); + const int numThreadsPerBlock = gpu_getNumThreadsPerBlock(); + qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock); qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg); qindex numAmpsPerCol = powerOf2(qureg.numQubits); @@ -1639,7 +1639,7 @@ devints devQubits = qubits; devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws - kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> ( + kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<<numBlocks, numThreadsPerBlock>>> ( getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads, firstDiagInd, numAmpsPerCol, qureg.rank, qureg.logNumAmpsPerNode,