From d558be191b7e04ca5361fd75eabc12e5e4f06197 Mon Sep 17 00:00:00 2001 From: Erich Essmann Date: Wed, 11 Feb 2026 12:30:15 +0000 Subject: [PATCH 1/2] second attempt --- quest/src/gpu/gpu_kernels.cuh | 16 +++++++++++----- quest/src/gpu/gpu_thrust.cuh | 13 ++++++++----- quest/src/gpu/gpu_types.cuh | 32 +++++++++++++++++++++++++++++++- 3 files changed, 50 insertions(+), 11 deletions(-) diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh index 4f2a737e..0d33c26a 100644 --- a/quest/src/gpu/gpu_kernels.cuh +++ b/quest/src/gpu/gpu_kernels.cuh @@ -724,7 +724,7 @@ __global__ void kernel_statevec_setQuregToWeightedSum_sub( cu_qcomp amp = getCuQcomp(0, 0); for (int q=0; q unpackMatrixToCuQcomps(CompMatr2 in) { +/* + * cu_qcomp ARITHMETIC HELPERS + * + * These explicitly implement component-wise arithmetic and are used in + * critical kernels/functors where backend operator overload behaviour has + * varied across toolchains. + */ + + +INLINE cu_qcomp addCuQcomp(cu_qcomp a, cu_qcomp b) { + return getCuQcomp(a.x + b.x, a.y + b.y); +} + +INLINE cu_qcomp mulCuQcomp(cu_qcomp a, cu_qcomp b) { + return getCuQcomp( + (a.x * b.x) - (a.y * b.y), + (a.x * b.y) + (a.y * b.x) + ); +} + +INLINE cu_qcomp mulCuQcomp(cu_qcomp a, qreal b) { + return getCuQcomp(a.x * b, a.y * b); +} + +INLINE cu_qcomp mulCuQcomp(qreal b, cu_qcomp a) { + return mulCuQcomp(a, b); +} + + + /* * cu_qcomp ARITHMETIC OVERLOADS * @@ -271,4 +301,4 @@ INLINE cu_qcomp getCompPower(cu_qcomp base, cu_qcomp exponent) { -#endif // GPU_TYPES_HPP \ No newline at end of file +#endif // GPU_TYPES_HPP From 91152d173bbc2a25c3c5d423345343852accbfcb Mon Sep 17 00:00:00 2001 From: Erich Essmann Date: Wed, 8 Apr 2026 14:43:59 +0100 Subject: [PATCH 2/2] Fix HIP complex arithmetic on ROCm 6+ --- quest/src/core/fastmath.hpp | 42 +++++++++++++++++++--- quest/src/gpu/cuda_to_hip.hpp | 16 ++++++++- quest/src/gpu/gpu_kernels.cuh | 64 +++++++++++++++++++++++----------- quest/src/gpu/gpu_thrust.cuh | 18 +++++----- quest/src/gpu/gpu_types.cuh | 58 +++++++++++++++++++++--------- tests/unit/calculations.cpp | 33 ++++++++++++++++++ tests/unit/initialisations.cpp | 36 +++++++++++++++++++ tests/unit/operations.cpp | 33 ++++++++++++++++++ 8 files changed, 249 insertions(+), 51 deletions(-) diff --git a/quest/src/core/fastmath.hpp b/quest/src/core/fastmath.hpp index 6367e116..31f44315 100644 --- a/quest/src/core/fastmath.hpp +++ b/quest/src/core/fastmath.hpp @@ -59,6 +59,37 @@ INLINE int fast_getPlusOrMinusMaskedBitParity(qindex num, qindex mask) { } +#ifdef USE_CU_QCOMP + +INLINE QCOMP_ALIAS fast_addQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) { + return addCuQcomp(a, b); +} + +INLINE QCOMP_ALIAS fast_mulQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) { + return mulCuQcomp(a, b); +} + +INLINE QCOMP_ALIAS fast_divQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) { + return divCuQcomp(a, b); +} + +#else + +INLINE QCOMP_ALIAS fast_addQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) { + return a + b; +} + +INLINE QCOMP_ALIAS fast_mulQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) { + return a * b; +} + +INLINE QCOMP_ALIAS fast_divQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) { + return a / b; +} + +#endif + + /* * INDEX ALGEBRA @@ -165,7 +196,7 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { int p = getTwoAdjacentBits(str.lowPaulis, 2*t); int i = getBit(row, t); int j = getBit(col, t); - elem = elem * matrices[p][i][j]; // HIP-friendly avoiding *= + elem = fast_mulQcompAlias(elem, matrices[p][i][j]); } // could be compile-time unrolled into 32 iterations @@ -173,7 +204,7 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) { int p = getTwoAdjacentBits(str.highPaulis, 2*t); int i = getBit(row, t + numPaulisPerMask); int j = getBit(col, t + numPaulisPerMask); - elem = elem * matrices[p][i][j]; + elem = fast_mulQcompAlias(elem, matrices[p][i][j]); } return elem; @@ -190,7 +221,10 @@ INLINE QCOMP_ALIAS fast_getPauliStrSumElem(QCOMP_ALIAS* coeffs, PauliStr* string // this loop is expected exponentially smaller than caller's loop for (qindex n=0; n x, vect cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return mulCuQcomp(value, toCuQcomp(util_getPowerOfI(y.size()))); } @@ -946,7 +946,7 @@ cu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector x, vect cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return mulCuQcomp(value, toCuQcomp(util_getPowerOfI(y.size()))); } @@ -964,7 +964,7 @@ cu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector x, vecto cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus()); - return value * toCuQcomp(util_getPowerOfI(y.size())); + return mulCuQcomp(value, toCuQcomp(util_getPowerOfI(y.size()))); } diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh index 6ff57774..60c233cd 100644 --- a/quest/src/gpu/gpu_types.cuh +++ b/quest/src/gpu/gpu_types.cuh @@ -137,21 +137,41 @@ __host__ inline std::array unpackMatrixToCuQcomps(CompMatr2 in) { /* * cu_qcomp ARITHMETIC HELPERS * - * These explicitly implement component-wise arithmetic and are used in - * critical kernels/functors where backend operator overload behaviour has - * varied across toolchains. + * These wrap the cuComplex helper API and are used in critical kernels/functors + * where backend operator overload behaviour has varied across toolchains. */ INLINE cu_qcomp addCuQcomp(cu_qcomp a, cu_qcomp b) { - return getCuQcomp(a.x + b.x, a.y + b.y); +#if (FLOAT_PRECISION == 1) + return cuCaddf(a, b); +#else + return cuCadd(a, b); +#endif +} + +INLINE cu_qcomp subCuQcomp(cu_qcomp a, cu_qcomp b) { +#if (FLOAT_PRECISION == 1) + return cuCsubf(a, b); +#else + return cuCsub(a, b); +#endif } INLINE cu_qcomp mulCuQcomp(cu_qcomp a, cu_qcomp b) { - return getCuQcomp( - (a.x * b.x) - (a.y * b.y), - (a.x * b.y) + (a.y * b.x) - ); +#if (FLOAT_PRECISION == 1) + return cuCmulf(a, b); +#else + return cuCmul(a, b); +#endif +} + +INLINE cu_qcomp divCuQcomp(cu_qcomp a, cu_qcomp b) { +#if (FLOAT_PRECISION == 1) + return cuCdivf(a, b); +#else + return cuCdiv(a, b); +#endif } INLINE cu_qcomp mulCuQcomp(cu_qcomp a, qreal b) { @@ -167,11 +187,10 @@ INLINE cu_qcomp mulCuQcomp(qreal b, cu_qcomp a) { /* * cu_qcomp ARITHMETIC OVERLOADS * - * which are only needed by NVCC because - * HIP defines them for us. This good deed - * goes punished; a HIP bug disables our - * use of *= and += overloads, so kernels.cuh - * has disgusting (x = x * y) statements. Bah! + * which are only needed by NVCC because HIP + * defines them for us. GPU kernels should still + * prefer the helpers above for complex-complex + * multiply and divide to avoid backend quirks. */ @@ -263,12 +282,19 @@ INLINE cu_qcomp operator * (const qreal& b, const cu_qcomp& a) { INLINE qreal getCompReal(cu_qcomp num) { - return num.x; +#if (FLOAT_PRECISION == 1) + return cuCrealf(num); +#else + return cuCreal(num); +#endif } INLINE cu_qcomp getCompConj(cu_qcomp num) { - num.y *= -1; - return num; +#if (FLOAT_PRECISION == 1) + return cuConjf(num); +#else + return cuConj(num); +#endif } INLINE qreal getCompNorm(cu_qcomp num) { diff --git a/tests/unit/calculations.cpp b/tests/unit/calculations.cpp index 4b4db284..c08c98cc 100644 --- a/tests/unit/calculations.cpp +++ b/tests/unit/calculations.cpp @@ -178,6 +178,39 @@ TEST_CASE( "calcExpecPauliStr", TEST_CATEGORY ) { } +TEST_CASE( "calcExpecPauliStr GPU regression preserves complex contributions", TEST_CATEGORY ) { + + SECTION( LABEL_CORRECTNESS ) { + + if (!getQuESTEnv().isGpuAccelerated) + return; + + int numQubits = getNumCachedQubits(); + qindex dim = getPow2(numQubits); + qreal amp = 1 / std::sqrt((qreal) 2); + qvector ref = getZeroVector(dim); + PauliStr str = getPauliStr("Y", {0}); + + ref[0] = qcomp(amp, 0); + ref[1] = qcomp(0, amp); + + qreal expected = std::real(getReferenceExpectationValue(ref, str)); + + for (auto& [label, qureg] : getCachedStatevecs()) { + + if (!qureg.isGpuAccelerated) + continue; + + DYNAMIC_SECTION( label ) { + + setQuregToReference(qureg, ref); + REQUIRE_AGREE( calcExpecPauliStr(qureg, str), expected ); + } + } + } +} + + TEST_CASE( "calcExpecPauliStrSum", TEST_CATEGORY ) { diff --git a/tests/unit/initialisations.cpp b/tests/unit/initialisations.cpp index ac1f1abd..40a6c712 100644 --- a/tests/unit/initialisations.cpp +++ b/tests/unit/initialisations.cpp @@ -406,6 +406,42 @@ TEST_CASE( "setQuregToPauliStrSum", TEST_CATEGORY ) { } +TEST_CASE( "setQuregToPauliStrSum GPU regression preserves imaginary amplitudes", TEST_CATEGORY ) { + + SECTION( LABEL_CORRECTNESS ) { + + if (!getQuESTEnv().isGpuAccelerated) + return; + + int numQubits = getNumCachedQubits(); + std::vector strings = { + getPauliStr("X", {0}), + getPauliStr("Y", {0}) + }; + std::vector coeffs = { + qcomp(0.25, 0.50), + qcomp(-0.50, 0.75) + }; + PauliStrSum sum = createPauliStrSum(strings, coeffs); + qmatrix refMat = getMatrix(sum, numQubits); + + for (auto& [label, qureg] : getCachedDensmatrs()) { + + if (!qureg.isGpuAccelerated) + continue; + + DYNAMIC_SECTION( label ) { + + setQuregToPauliStrSum(qureg, sum); + REQUIRE_AGREE( qureg, refMat ); + } + } + + destroyPauliStrSum(sum); + } +} + + TEST_CASE( "setQuregToWeightedSum", TEST_CATEGORY ) { SECTION( LABEL_CORRECTNESS ) { diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp index 0e33220d..ab449581 100644 --- a/tests/unit/operations.cpp +++ b/tests/unit/operations.cpp @@ -2365,6 +2365,39 @@ TEST_CASE( "applyNonUnitaryPauliGadget", TEST_CATEGORY_OPS ) { } +TEST_CASE( "applyPauliY GPU regression preserves imaginary amplitudes", TEST_CATEGORY_OPS ) { + + SECTION( LABEL_CORRECTNESS ) { + + if (!getQuESTEnv().isGpuAccelerated) + return; + + int numQubits = getNumCachedQubits(); + qindex dim = getPow2(numQubits); + qreal amp = 1 / std::sqrt((qreal) dim); + qvector in = getConstantVector(dim, qcomp(amp, 0)); + qvector ref = in; + + applyReferenceOperator(ref, {0}, FixedMatrices::Y); + + for (auto& [label, qureg] : getCachedStatevecs()) { + + if (!qureg.isGpuAccelerated) + continue; + + DYNAMIC_SECTION( label ) { + + setQuregToReference(qureg, in); + applyPauliY(qureg, 0); + + REQUIRE_AGREE( qureg, ref ); + REQUIRE_AGREE( calcTotalProb(qureg), getReferenceProbability(ref) ); + } + } + } +} + + /** @} (end defgroup) */