diff --git a/quest/include/environment.h b/quest/include/environment.h
index 04f24bfe..a6724828 100644
--- a/quest/include/environment.h
+++ b/quest/include/environment.h
@@ -83,6 +83,14 @@ int isQuESTEnvInit();
 QuESTEnv getQuESTEnv();
 
+/** @notyetdoced
+ * Getter and setter for the GPU threads-per-block used by kernel launches, declared here
+ * as the most natural pre-existing home. This applies only to GPU deployment; for OpenMP,
+ * the user can instead export OMP_NUM_THREADS or call omp_set_num_threads().
+ */
+int getQuESTGpuThreadsPerBlock();
+void setQuESTGpuThreadsPerBlock(const int NEW_TPB);
+
 // end de-mangler
 #ifdef __cplusplus
diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
index 54149189..1f36ee64 100644
--- a/quest/src/api/environment.cpp
+++ b/quest/src/api/environment.cpp
@@ -509,5 +509,16 @@ void getEnvironmentString(char str[200]) {
 }
 
 
+int getQuESTGpuThreadsPerBlock() {
+    QuESTEnv env = getQuESTEnv();
+    return env.isGpuAccelerated? gpu_getNumThreadsPerBlock() : 0;
+}
+
+void setQuESTGpuThreadsPerBlock(const int NEW_TPB) {
+    // rely on the internal function to throw an error if GPU support was not compiled
+    gpu_setNumThreadsPerBlock(NEW_TPB);
+    return;
+}
+
 // end de-mangler
 }
diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
index c11ec224..ad8e303a 100644
--- a/quest/src/cpu/cpu_config.cpp
+++ b/quest/src/cpu/cpu_config.cpp
@@ -79,9 +79,7 @@ int cpu_getAvailableNumThreads() {
 #if COMPILE_OPENMP
 
     int n = -1;
-    #pragma omp parallel shared(n)
-    #pragma omp single
-    n = omp_get_num_threads();
+    n = omp_get_max_threads();
 
     return n;
 #else
diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
index c7db834b..588779c4 100644
--- a/quest/src/gpu/gpu_config.cpp
+++ b/quest/src/gpu/gpu_config.cpp
@@ -41,6 +41,7 @@
 #include "quest/src/gpu/cuda_to_hip.hpp"
 #endif
 
+int numThreadsPerBlock = 128;
 
 
 /*
@@ -330,6 +331,24 @@ qindex gpu_getMaxNumConcurrentThreads() {
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock() {
+#if COMPILE_CUDA
+    return numThreadsPerBlock;
+#else
+    error_gpuQueriedButGpuNotCompiled();
+    return -1;
+#endif
+}
+
+void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock) {
+#if COMPILE_CUDA
+    numThreadsPerBlock = newThreadsPerBlock;
+#else
+    error_gpuQueriedButGpuNotCompiled();
+#endif
+    return;
+}
+
 std::array getBoundGpuUuid() {
 
 #if COMPILE_CUDA
diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
index 1b3be629..0787e127 100644
--- a/quest/src/gpu/gpu_config.hpp
+++ b/quest/src/gpu/gpu_config.hpp
@@ -19,7 +19,6 @@
 
 #include "quest/include/channels.h"
 
-
 /*
  * CUDA ERROR HANDLING
  */
@@ -65,6 +64,10 @@ qindex gpu_getMaxNumConcurrentThreads();
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock();
+
+void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock);
+
 void gpu_bindLocalGPUsToNodes();
 
 bool gpu_areAnyNodesBoundToSameGpu();
@@ -76,7 +79,6 @@ void gpu_initCuQuantum();
 
 void gpu_finalizeCuQuantum();
 
-
 /*
  * MEMORY MANAGEMENT
  */
@@ -122,4 +124,4 @@ size_t gpu_getCacheMemoryInBytes();
 
 
 
-#endif // GPU_CONFIG_HPP
\ No newline at end of file
+#endif // GPU_CONFIG_HPP
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 4f2a737e..540a409f 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -46,23 +46,19 @@
  * THREAD MANAGEMENT
  */
 
-
-const int NUM_THREADS_PER_BLOCK = 128;
-
-
 __forceinline__ __device__ qindex getThreadInd() {
     return blockIdx.x*blockDim.x + threadIdx.x;
 }
 
 
-__host__ qindex getNumBlocks(qindex numThreads) {
+__host__ qindex getNumBlocks(qindex numThreads, const int numThreadsPerBlock) {
 
     /// @todo
     /// improve this with cudaOccupancyMaxPotentialBlockSize(),
     /// making it function specific
 
     // CUDA ceil
-    return ceil(numThreads / static_cast<qreal>(NUM_THREADS_PER_BLOCK));
+    return ceil(numThreads / static_cast<qreal>(numThreadsPerBlock));
 }
diff --git a/quest/src/gpu/gpu_subroutines.cpp b/quest/src/gpu/gpu_subroutines.cpp
index 5e18048f..a75c44cc 100644
--- a/quest/src/gpu/gpu_subroutines.cpp
+++ b/quest/src/gpu/gpu_subroutines.cpp
@@ -66,7 +66,6 @@
 
 using std::vector;
 
-
 /*
  * GETTERS
  */
@@ -141,13 +140,14 @@ qindex gpu_statevec_packAmpsIntoBuffer(Qureg qureg, vector qubits, vector
-    kernel_statevec_packAmpsIntoBuffer <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_packAmpsIntoBuffer <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd],
         numThreads, getPtr(sortedQubits), qubits.size(), qubitStateMask
     );
@@ -169,10 +169,11 @@ qindex gpu_statevec_packPairSummedAmpsIntoBuffer(Qureg qureg, int qubit1, int qu
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex sendInd = getSubBufferSendInd(qureg);
 
-    kernel_statevec_packPairSummedAmpsIntoBuffer <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_packPairSummedAmpsIntoBuffer <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[sendInd],
         numThreads, qubit1, qubit2, qubit3, bit2
     );
@@ -208,12 +209,13 @@ void gpu_statevec_anyCtrlSwap_subA(Qureg qureg, vector ctrls, vector c
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(2 + ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = util_getSorted(ctrls, {targ2, targ1});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ2, targ1}, {0, 1});
 
-    kernel_statevec_anyCtrlSwap_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2
     );
@@ -232,13 +234,14 @@ void gpu_statevec_anyCtrlSwap_subB(Qureg qureg, vector ctrls, vector c
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex recvInd = getBufferRecvInd();
 
     devints sortedCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlSwap_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd],
         numThreads, getPtr(sortedCtrls), ctrls.size(), ctrlStateMask
     );
@@ -257,13 +260,14 @@ void gpu_statevec_anyCtrlSwap_subC(Qureg qureg, vector ctrls, vector c
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(1 + ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex recvInd = getBufferRecvInd();
 
     devints sortedQubits = util_getSorted(ctrls, {targ});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {targState});
 
-    kernel_statevec_anyCtrlSwap_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlSwap_subC <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd],
         numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask
     );
@@ -299,14 +303,15 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subA(Qureg qureg, vector ctrls, v
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 1);
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = util_getSorted(ctrls, {targ});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ}, {0});
 
     auto [m00, m01, m10, m11] = unpackMatrixToCuQcomps(matr);
 
-    kernel_statevec_anyCtrlOneTargDenseMatr_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDenseMatr_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ,
         m00, m01, m10, m11
@@ -326,13 +331,14 @@ void gpu_statevec_anyCtrlOneTargDenseMatr_subB(Qureg qureg, vector ctrls, v
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex recvInd = getBufferRecvInd();
 
     devints sortedCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlOneTargDenseMatr_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDenseMatr_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads,
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask, toCuQcomp(fac0), toCuQcomp(fac1)
@@ -368,7 +374,8 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size() + 2);
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedQubits = util_getSorted(ctrls, {targ1,targ2});
     qindex qubitStateMask = util_getBitMask(ctrls, ctrlStates, {targ1,targ2}, {0,0});
@@ -376,7 +383,7 @@ void gpu_statevec_anyCtrlTwoTargDenseMatr_sub(Qureg qureg, vector ctrls, ve
     // unpack matrix elems which are more efficiently accessed by kernels as args than shared mem (... maybe...)
     auto m = unpackMatrixToCuQcomps(matr);
 
-    kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlTwoTargDenseMatr_sub <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, getPtr(sortedQubits), ctrls.size(), qubitStateMask, targ1, targ2,
         m[0], m[1], m[2], m[3], m[4], m[5], m[6], m[7],
@@ -453,7 +460,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve
     if constexpr (NumTargs != -1) {
 
         // when NumTargs <= 5, each thread has a private array stored in the registers,
-        // enabling rapid IO. Given NUM_THREADS_PER_BLOCK = 128, the maximum size of
+        // enabling rapid IO. Given the default numThreadsPerBlock = 128, the maximum size of
         // this array per-block is 16 * 128 * 2^5 B = 64 KiB which exceeds shared
         // memory capacity, but does NOT exceed maximum register capacity.
@@ -463,11 +470,12 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve
         /// global memory) and greatly sabotage performance on some GPUs.
 
         qindex numThreads = numBatches;
-        qindex numBlocks = getNumBlocks(numThreads);
+        const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+        qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
         kernel_statevec_anyCtrlFewTargDenseMatr
-            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+            <<<numBlocks, numThreadsPerBlock>>> (
                 ampsPtr, numThreads,
                 qubitsPtr, nCtrls, qubitStateMask,
                 targsPtr, matrPtr
@@ -486,6 +494,7 @@ void gpu_statevec_anyCtrlAnyTargDenseMatr_sub(Qureg qureg, vector ctrls, ve
         // where we assign one-block per multiprocessor because we are anyway memory-
         // bandwidth bound (so we don't expect many interweaved blocks per MP).
         qindex numThreads = gpu_getMaxNumConcurrentThreads();
+        const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
 
         // use strictly 2^# threads to maintain precondition of all kernels
         if (!isPowerOf2(numThreads))
@@ -497,15 +506,15 @@
         // evenly distribute the batches between threads, and the threads unevenly between blocks
         qindex numBatchesPerThread = numBatches / numThreads; // divides evenly
-        qindex numBlocks = getNumBlocks(numThreads);
+        qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
         // expand the cache if necessary
-        qindex numKernelInvocations = numBlocks * NUM_THREADS_PER_BLOCK;
+        qindex numKernelInvocations = numBlocks * numThreadsPerBlock;
         qcomp* cache = gpu_getCacheOfSize(powerOf2(targs.size()), numKernelInvocations);
 
         kernel_statevec_anyCtrlManyTargDenseMatr
-            <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+            <<<numBlocks, numThreadsPerBlock>>> (
                 toCuQcomps(cache), ampsPtr, numThreads, numBatchesPerThread,
                 qubitsPtr, nCtrls, qubitStateMask,
@@ -566,13 +575,14 @@ void gpu_statevec_anyCtrlOneTargDiagMatr_sub(Qureg qureg, vector ctrls, vec
     /// efficient (because of improved parallelisation granularity)
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     auto elems = unpackMatrixToCuQcomps(matr);
 
-    kernel_statevec_anyCtrlOneTargDiagMatr_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlOneTargDiagMatr_sub <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, targ, elems[0], elems[1]
     );
@@ -634,13 +644,14 @@ void gpu_statevec_anyCtrlTwoTargDiagMatr_sub(Qureg qureg, vector ctrls, vec
     /// efficient (because of improved parallelisation granularity)
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     auto elems = unpackMatrixToCuQcomps(matr);
 
-    kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlTwoTargDiagMatr_sub <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask,
         targ1, targ2, elems[0], elems[1], elems[2], elems[3]
@@ -702,13 +713,14 @@ void gpu_statevec_anyCtrlAnyTargDiagMatr_sub(Qureg qureg, vector ctrls, vec
     /// efficient (because of improved parallelisation granularity)
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints deviceTargs = targs;
     devints deviceCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_anyCtrlAnyTargDiagMatr_sub <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
         getPtr(deviceCtrls), ctrls.size(), ctrlStateMask, getPtr(deviceTargs), targs.size(),
         toCuQcomps(util_getGpuMemPtr(matr)), toCuQcomp(exponent)
@@ -759,11 +771,12 @@ void gpu_densmatr_allTargDiagMatr_sub(Qureg qureg, FullStateDiagMatr matr, qcomp
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     kernel_densmatr_allTargDiagMatr_sub
-        <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+        <<<numBlocks, numThreadsPerBlock>>> (
             toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode,
             toCuQcomps(util_getGpuMemPtr(matr)), matr.numElems, toCuQcomp(exponent)
     );
@@ -821,8 +834,9 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subA(Qureg qureg, vector ct
     // faster than when giving threads many pair-amps to modify, due to memory movements
     qindex numThreads = (qureg.numAmpsPerNode / powerOf2(ctrls.size())) / 2; // divides evenly
-    qindex numBlocks = getNumBlocks(numThreads);
-    kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
+    kernel_statevector_anyCtrlPauliTensorOrGadget_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         getPtr(deviceQubits), ctrls.size(), qubitStateMask,
         getPtr(deviceTargs), deviceTargs.size(),
@@ -843,7 +857,8 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex recvInd = getBufferRecvInd();
 
     qcomp powI = util_getPowerOfI(y.size());
@@ -853,7 +868,7 @@ void gpu_statevector_anyCtrlPauliTensorOrGadget_subB(Qureg qureg, vector ct
     devints sortedCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
 
-    kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevector_anyCtrlPauliTensorOrGadget_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd], numThreads,
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask,
         maskXY, maskYZ, bufferMaskXY,
@@ -884,13 +899,14 @@ void gpu_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(Qureg qureg, vector c
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / powerOf2(ctrls.size());
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints sortedCtrls = util_getSorted(ctrls);
     qindex ctrlStateMask = util_getBitMask(ctrls, ctrlStates);
     qindex targMask = util_getBitMask(targs);
 
-    kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads,
         getPtr(sortedCtrls), ctrls.size(), ctrlStateMask,
         targMask, toCuQcomp(fac0), toCuQcomp(fac1)
@@ -917,7 +933,8 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // extract amp ptrs from qureg list
     vector ptrs;
@@ -929,7 +946,7 @@ void gpu_statevec_setQuregToWeightedSum_sub(Qureg outQureg, vector coeffs
     devcuqcompptrs devQuregAmps = ptrs;
     devcomps devCoeffs = coeffs;
 
-    kernel_statevec_setQuregToWeightedSum_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_setQuregToWeightedSum_sub <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(outQureg.gpuAmps), numThreads,
         getPtr(devCoeffs), getPtr(devQuregAmps), inQuregs.size()
     );
@@ -957,9 +974,10 @@ void gpu_densmatr_mixQureg_subB(qreal outProb, Qureg outQureg, qreal inProb, Qur
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    kernel_densmatr_mixQureg_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_mixQureg_subB <<<numBlocks, numThreadsPerBlock>>> (
         outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(inQureg.gpuAmps),
         numThreads, inQureg.numAmps
     );
@@ -975,9 +993,10 @@ void gpu_densmatr_mixQureg_subC(qreal outProb, Qureg outQureg, qreal inProb) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
-    kernel_densmatr_mixQureg_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_mixQureg_subC <<<numBlocks, numThreadsPerBlock>>> (
         outProb, toCuQcomps(outQureg.gpuAmps), inProb, toCuQcomps(outQureg.gpuCommBuffer),
         numThreads, outQureg.rank, powerOf2(outQureg.numQubits), outQureg.logNumAmpsPerNode
     );
@@ -1007,12 +1026,13 @@ void gpu_densmatr_oneQubitDephasing_subA(Qureg qureg, int ketQubit, qreal prob)
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braQubit = util_getBraQubit(ketQubit, qureg);
 
-    kernel_densmatr_oneQubitDephasing_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDephasing_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit, fac
     );
@@ -1033,12 +1053,13 @@ void gpu_densmatr_oneQubitDephasing_subB(Qureg qureg, int ketQubit, qreal prob)
 #elif COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto fac = util_getOneQubitDephasingFactor(prob);
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
 
-    kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, fac
     );
@@ -1078,13 +1099,14 @@ void gpu_densmatr_twoQubitDephasing_subB(Qureg qureg, int ketQubitA, int ketQubi
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto term = util_getTwoQubitDephasingTerm(prob);
 
     int braQubitA = util_getBraQubit(ketQubitA, qureg);
     int braQubitB = util_getBraQubit(ketQubitB, qureg);
 
-    kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDephasing_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qureg.rank, qureg.logNumAmpsPerNode, // numAmps, not numCols
         ketQubitA, ketQubitB, braQubitA, braQubitB, term
     );
@@ -1106,12 +1128,13 @@ void gpu_densmatr_oneQubitDepolarising_subA(Qureg qureg, int ketQubit, qreal pro
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_oneQubitDepolarising_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDepolarising_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit,
         factors.c1, factors.c2, factors.c3
     );
@@ -1126,13 +1149,14 @@ void gpu_densmatr_oneQubitDepolarising_subB(Qureg qureg, int ketQubit, qreal pro
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_oneQubitDepolarising_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDepolarising_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd],
         numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3
     );
@@ -1154,13 +1178,14 @@ void gpu_densmatr_twoQubitDepolarising_subA(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
     auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
 
-    kernel_densmatr_twoQubitDepolarising_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, c3
     );
@@ -1176,7 +1201,8 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 16;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braQb2 = util_getBraQubit(ketQb2, qureg);
@@ -1185,7 +1211,7 @@ void gpu_densmatr_twoQubitDepolarising_subB(Qureg qureg, int ketQb1, int ketQb2,
     // each kernel invocation sums all 4 amps together, so adjusts c1
     qreal altc1 = factors.c1 - factors.c2;
 
-    kernel_densmatr_twoQubitDepolarising_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braQb2, altc1, factors.c2
     );
@@ -1201,13 +1227,14 @@ void gpu_densmatr_twoQubitDepolarising_subC(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto c3 = util_getTwoQubitDepolarisingFactors(prob).c3;
 
-    kernel_densmatr_twoQubitDepolarising_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subC <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braQb1, braBit2, c3
     );
@@ -1223,14 +1250,15 @@ void gpu_densmatr_twoQubitDepolarising_subD(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 8;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex offset = getBufferRecvInd();
 
     int braQb1 = util_getBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto factors = util_getTwoQubitDepolarisingFactors(prob);
 
-    kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subD <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads,
         ketQb1, ketQb2, braQb1, braBit2, factors.c1, factors.c2
     );
@@ -1246,7 +1274,8 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
@@ -1255,7 +1284,7 @@ void gpu_densmatr_twoQubitDepolarising_subE(Qureg qureg, int ketQb1, int ketQb2,
     qreal fac0 = 1 + factors.c3;
     qreal fac1 = factors.c1 - fac0;
 
-    kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subE <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQb1, ketQb2, braBit1, braBit2, fac0, fac1
     );
@@ -1271,14 +1300,15 @@ void gpu_densmatr_twoQubitDepolarising_subF(Qureg qureg, int ketQb1, int ketQb2,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex offset = getBufferRecvInd();
 
     int braBit1 = util_getRankBitOfBraQubit(ketQb1, qureg);
     int braBit2 = util_getRankBitOfBraQubit(ketQb2, qureg);
     auto c2 = util_getTwoQubitDepolarisingFactors(prob).c2;
 
-    kernel_densmatr_twoQubitDepolarising_subF <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_twoQubitDepolarising_subF <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[offset], numThreads,
         ketQb1, ketQb2, braBit1, braBit2, c2
     );
@@ -1300,12 +1330,13 @@ void gpu_densmatr_oneQubitPauliChannel_subA(Qureg qureg, int ketQubit, qreal pI,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ);
 
-    kernel_densmatr_oneQubitPauliChannel_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitPauliChannel_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit,
         factors.c1, factors.c2, factors.c3, factors.c4
     );
@@ -1321,13 +1352,14 @@ void gpu_densmatr_oneQubitPauliChannel_subB(Qureg qureg, int ketQubit, qreal pI,
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex recvInd = getBufferRecvInd();
 
     int braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitPauliChannelFactors(pI, pX, pY, pZ);
 
-    kernel_densmatr_oneQubitPauliChannel_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitPauliChannel_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd],
         numThreads, ketQubit, braBit, factors.c1, factors.c2, factors.c3, factors.c4
    );
@@ -1349,12 +1381,13 @@ void gpu_densmatr_oneQubitDamping_subA(Qureg qureg, int ketQubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 4;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     int braQubit = util_getBraQubit(ketQubit, qureg);
     auto factors = util_getOneQubitDampingFactors(prob);
 
-    kernel_densmatr_oneQubitDamping_subA <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subA <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braQubit,
         prob, factors.c1, factors.c2
     );
@@ -1370,11 +1403,12 @@ void gpu_densmatr_oneQubitDamping_subB(Qureg qureg, int qubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto c2 = util_getOneQubitDampingFactors(prob).c2;
 
-    kernel_densmatr_oneQubitDamping_subB <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subB <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, qubit, c2
     );
@@ -1389,12 +1423,13 @@ void gpu_densmatr_oneQubitDamping_subC(Qureg qureg, int ketQubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     auto braBit = util_getRankBitOfBraQubit(ketQubit, qureg);
     auto c1 = util_getOneQubitDampingFactors(prob).c1;
 
-    kernel_densmatr_oneQubitDamping_subC <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subC <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), numThreads, ketQubit, braBit, c1
     );
@@ -1409,10 +1444,11 @@ void gpu_densmatr_oneQubitDamping_subD(Qureg qureg, int qubit, qreal prob) {
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = qureg.numAmpsPerNode / 2;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex recvInd = getBufferRecvInd();
 
-    kernel_densmatr_oneQubitDamping_subD <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_oneQubitDamping_subD <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(qureg.gpuAmps), &toCuQcomps(qureg.gpuCommBuffer)[recvInd],
         numThreads, qubit, prob
     );
@@ -1437,13 +1473,14 @@ void gpu_densmatr_partialTrace_sub(Qureg inQureg, Qureg outQureg, vector ta
 #if COMPILE_CUDA || COMPILE_CUQUANTUM
 
     qindex numThreads = outQureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     devints devTargs = targs;
     devints devPairTargs = pairTargs;
     devints devAllTargs = util_getSorted(targs, pairTargs);
 
-    kernel_densmatr_partialTrace_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_partialTrace_sub <<<numBlocks, numThreadsPerBlock>>> (
         toCuQcomps(inQureg.gpuAmps), toCuQcomps(outQureg.gpuAmps), numThreads,
         getPtr(devTargs), getPtr(devPairTargs), getPtr(devAllTargs), targs.size()
     );
@@ -1557,13 +1594,14 @@ void gpu_statevec_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
 #if COMPILE_CUDA
 
     qindex numThreads = qureg.numAmpsPerNode;
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     // allocate exponentially-big temporary memory (error if failed)
     devints devQubits = qubits;
     devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws
 
-    kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_statevec_calcProbsOfAllMultiQubitOutcomes_sub <<<numBlocks, numThreadsPerBlock>>> (
         getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads,
         qureg.rank, qureg.logNumAmpsPerNode, getPtr(devQubits), devQubits.size()
     );
@@ -1591,7 +1629,8 @@ void gpu_densmatr_calcProbsOfAllMultiQubitOutcomes_sub(qreal* outProbs, Qureg qu
     // we decouple numColsPerNode and numThreads for clarity
     // (and in case parallelisation granularity ever changes);
     qindex numThreads = powerOf2(qureg.logNumColsPerNode);
-    qindex numBlocks = getNumBlocks(numThreads);
+    const int numThreadsPerBlock = gpu_getNumThreadsPerBlock();
+    qindex numBlocks = getNumBlocks(numThreads, numThreadsPerBlock);
 
     qindex firstDiagInd = util_getLocalIndexOfFirstDiagonalAmp(qureg);
     qindex numAmpsPerCol = powerOf2(qureg.numQubits);
@@ -1600,7 +1639,7 @@
     devints devQubits = qubits;
     devreals devProbs = getDeviceRealsVec(powerOf2(qubits.size())); // throws
 
-    kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<<numBlocks, NUM_THREADS_PER_BLOCK>>> (
+    kernel_densmatr_calcProbsOfAllMultiQubitOutcomes_sub <<<numBlocks, numThreadsPerBlock>>> (
        getPtr(devProbs), toCuQcomps(qureg.gpuAmps), numThreads,
        firstDiagInd, numAmpsPerCol, qureg.rank, qureg.logNumAmpsPerNode,
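Usage note (not part of the patch): a minimal sketch of how user code might exercise the new threads-per-block control. The initQuESTEnv/finalizeQuESTEnv calls, the "quest.h" include path, and the chosen block size of 256 are assumptions for illustration only.

    // sketch: query and override the GPU threads-per-block from user code
    #include "quest.h"
    #include <cstdio>

    int main() {
        initQuESTEnv();

        // the getter returns 0 when the environment is not GPU-accelerated
        std::printf("default threads per block: %d\n", getQuESTGpuThreadsPerBlock());

        // the value should be a sensible CUDA block size (e.g. a multiple of the 32-thread warp);
        // the setter itself only errors if GPU support was not compiled at all
        setQuESTGpuThreadsPerBlock(256);
        std::printf("current threads per block: %d\n", getQuESTGpuThreadsPerBlock());

        finalizeQuESTEnv();
        return 0;
    }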
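Review note on getNumBlocks(): since the divisor is now a runtime argument, the grid size could equally be computed with an integer ceiling division, which sidesteps floating-point rounding for very large numThreads. A possible alternative body, offered only as a sketch and not what the patch implements:

    __host__ qindex getNumBlocks(qindex numThreads, const int numThreadsPerBlock) {

        // integer ceiling division, equivalent to ceil(numThreads / (double) numThreadsPerBlock)
        return (numThreads + numThreadsPerBlock - 1) / numThreadsPerBlock;
    }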