From d558be191b7e04ca5361fd75eabc12e5e4f06197 Mon Sep 17 00:00:00 2001
From: Erich Essmann <e.essmann@epcc.ed.ac.uk>
Date: Wed, 11 Feb 2026 12:30:15 +0000
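Subject: [PATCH 1/2] Add explicit cu_qcomp add/mul helpers for GPU kernels

Introduce addCuQcomp() and mulCuQcomp() in gpu_types.cuh and use them in
place of the overloaded + and * operators in the weighted-sum and
density-matrix mixing kernels, and in the mixAmps and projection Thrust
functors.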

---
 quest/src/gpu/gpu_kernels.cuh | 16 +++++++++++-----
 quest/src/gpu/gpu_thrust.cuh  | 13 ++++++++-----
 quest/src/gpu/gpu_types.cuh   | 32 +++++++++++++++++++++++++++++++-
 3 files changed, 50 insertions(+), 11 deletions(-)

diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 4f2a737e..0d33c26a 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -724,7 +724,7 @@ __global__ void kernel_statevec_setQuregToWeightedSum_sub(
     cu_qcomp amp = getCuQcomp(0, 0);
 
     for (int q=0; q<numInner; q++)
-        amp = amp + coeffs[q] * inAmps[q][n];
+        amp = addCuQcomp(amp, mulCuQcomp(coeffs[q], inAmps[q][n]));
 
     // must not modify outAmps[n] before computing the amp 
     // since outAmps can legally appear among inAmps
@@ -749,8 +749,11 @@ __global__ void kernel_densmatr_mixQureg_subB(
 
     cu_qcomp iAmp = inAmps[i];
     cu_qcomp jAmp = inAmps[j]; jAmp.y *= -1; // conj
-    
-    outAmps[n] = (outProb * outAmps[n]) + (inProb * iAmp * jAmp);
+
+    outAmps[n] = addCuQcomp(
+        mulCuQcomp(outProb, outAmps[n]),
+        mulCuQcomp(inProb, mulCuQcomp(iAmp, jAmp))
+    );
 }
 
 
@@ -769,8 +772,11 @@ __global__ void kernel_densmatr_mixQureg_subC(
 
     cu_qcomp iAmp = inAmps[i];
     cu_qcomp jAmp = inAmps[j]; jAmp.y *= -1; // conj
-    
-    outAmps[n] = (outProb * outAmps[n]) + (inProb * iAmp * jAmp);
+
+    outAmps[n] = addCuQcomp(
+        mulCuQcomp(outProb, outAmps[n]),
+        mulCuQcomp(inProb, mulCuQcomp(iAmp, jAmp))
+    );
 }
 
 
diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh
index 3b653dfd..1a5d6c57 100644
--- a/quest/src/gpu/gpu_thrust.cuh
+++ b/quest/src/gpu/gpu_thrust.cuh
@@ -382,8 +382,11 @@ struct functor_mixAmps {
     functor_mixAmps(qreal out, qreal in) : outProb(out), inProb(in) {}
 
     __host__ __device__ cu_qcomp operator()(cu_qcomp outAmp, cu_qcomp inAmp) {
-        
-        return (outProb * outAmp) + (inProb * inAmp);
+
+        return addCuQcomp(
+            mulCuQcomp(outProb, outAmp),
+            mulCuQcomp(inProb, inAmp)
+        );
     }
 };
 
@@ -533,7 +536,7 @@ struct functor_projectStateVec {
         // return amp scaled by zero or renorm, depending on whether n has projected substate
         qindex val = getValueOfBits(n, targetsPtr, numBits);
         qreal fac = renorm * (val == retainValue);
-        return fac * amp;
+        return mulCuQcomp(fac, amp);
     }
 };
 
@@ -579,7 +582,7 @@ struct functor_projectDensMatr {
 
         // multiply amp with renorm or zero if values disagree with given outcomes
         qreal fac = renorm * (v1 == v2) * (retainValue == v1);
-        return fac * amp;
+        return mulCuQcomp(fac, amp);
     }
 };
 
@@ -1082,4 +1085,4 @@ void thrust_statevec_initUnnormalisedUniformlyRandomPureStateAmps_sub(Qureg qure
 
 
 
-#endif // GPU_THRUST_HPP
\ No newline at end of file
+#endif // GPU_THRUST_HPP
diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh
index a934ecef..6ff57774 100644
--- a/quest/src/gpu/gpu_types.cuh
+++ b/quest/src/gpu/gpu_types.cuh
@@ -134,6 +134,36 @@ __host__ inline std::array<cu_qcomp,16> unpackMatrixToCuQcomps(CompMatr2 in) {
 
 
 
+/*
+ * cu_qcomp ARITHMETIC HELPERS
+ *
+ * These explicitly implement component-wise arithmetic and are used in
+ * critical kernels/functors where backend operator overload behaviour has
+ * varied across toolchains.
+ */
+
+
+INLINE cu_qcomp addCuQcomp(cu_qcomp a, cu_qcomp b) {
+    return getCuQcomp(a.x + b.x, a.y + b.y);
+}
+
+INLINE cu_qcomp mulCuQcomp(cu_qcomp a, cu_qcomp b) {
+    return getCuQcomp(
+        (a.x * b.x) - (a.y * b.y),
+        (a.x * b.y) + (a.y * b.x)
+    );
+}
+
+INLINE cu_qcomp mulCuQcomp(cu_qcomp a, qreal b) {
+    return getCuQcomp(a.x * b, a.y * b);
+}
+
+INLINE cu_qcomp mulCuQcomp(qreal b, cu_qcomp a) {
+    return mulCuQcomp(a, b);
+}
+
+
+
 /*
  * cu_qcomp ARITHMETIC OVERLOADS
  *
@@ -271,4 +301,4 @@ INLINE cu_qcomp getCompPower(cu_qcomp base, cu_qcomp exponent) {
 
 
 
-#endif // GPU_TYPES_HPP
\ No newline at end of file
+#endif // GPU_TYPES_HPP

From 91152d173bbc2a25c3c5d423345343852accbfcb Mon Sep 17 00:00:00 2001
From: Erich Essmann <e.essmann@epcc.ed.ac.uk>
Date: Wed, 8 Apr 2026 14:43:59 +0100
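Subject: [PATCH 2/2] Fix HIP complex arithmetic on ROCm 6+

Replace overloaded cu_qcomp operator expressions in the GPU kernels and
Thrust functors with explicit calls to addCuQcomp, subCuQcomp and
mulCuQcomp. The complex-complex helpers, together with the new
divCuQcomp, now wrap the cuComplex functions (cuCadd, cuCsub, cuCmul,
cuCdiv and their float variants), which cuda_to_hip.hpp maps onto the
hipC* equivalents. The fastmath Pauli helpers go through new
fast_addQcompAlias/fast_mulQcompAlias wrappers, which forward to the
cu_qcomp helpers when USE_CU_QCOMP is defined.

Add GPU-only regression tests for calcExpecPauliStr,
setQuregToPauliStrSum and applyPauliY.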

---
 quest/src/core/fastmath.hpp    | 42 +++++++++++++++++++---
 quest/src/gpu/cuda_to_hip.hpp  | 16 ++++++++-
 quest/src/gpu/gpu_kernels.cuh  | 64 +++++++++++++++++++++++-----------
 quest/src/gpu/gpu_thrust.cuh   | 18 +++++-----
 quest/src/gpu/gpu_types.cuh    | 58 +++++++++++++++++++++---------
 tests/unit/calculations.cpp    | 33 ++++++++++++++++++
 tests/unit/initialisations.cpp | 36 +++++++++++++++++++
 tests/unit/operations.cpp      | 33 ++++++++++++++++++
 8 files changed, 249 insertions(+), 51 deletions(-)

diff --git a/quest/src/core/fastmath.hpp b/quest/src/core/fastmath.hpp
index 6367e116..31f44315 100644
--- a/quest/src/core/fastmath.hpp
+++ b/quest/src/core/fastmath.hpp
@@ -59,6 +59,37 @@ INLINE int fast_getPlusOrMinusMaskedBitParity(qindex num, qindex mask) {
 }
 
 
+#ifdef USE_CU_QCOMP
+
+INLINE QCOMP_ALIAS fast_addQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) {
+    return addCuQcomp(a, b);
+}
+
+INLINE QCOMP_ALIAS fast_mulQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) {
+    return mulCuQcomp(a, b);
+}
+
+INLINE QCOMP_ALIAS fast_divQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) {
+    return divCuQcomp(a, b);
+}
+
+#else
+
+INLINE QCOMP_ALIAS fast_addQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) {
+    return a + b;
+}
+
+INLINE QCOMP_ALIAS fast_mulQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) {
+    return a * b;
+}
+
+INLINE QCOMP_ALIAS fast_divQcompAlias(QCOMP_ALIAS a, QCOMP_ALIAS b) {
+    return a / b;
+}
+
+#endif
+
+
 
 /*
  * INDEX ALGEBRA
@@ -165,7 +196,7 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {
         int p = getTwoAdjacentBits(str.lowPaulis, 2*t);
         int i = getBit(row, t);
         int j = getBit(col, t);
-        elem = elem * matrices[p][i][j]; // HIP-friendly avoiding *=
+        elem = fast_mulQcompAlias(elem, matrices[p][i][j]);
     }
 
     // could be compile-time unrolled into 32 iterations
@@ -173,7 +204,7 @@ INLINE QCOMP_ALIAS fast_getPauliStrElem(PauliStr str, qindex row, qindex col) {
         int p = getTwoAdjacentBits(str.highPaulis, 2*t);
         int i = getBit(row, t + numPaulisPerMask);
         int j = getBit(col, t + numPaulisPerMask);
-        elem = elem * matrices[p][i][j];
+        elem = fast_mulQcompAlias(elem, matrices[p][i][j]);
     }
 
     return elem;
@@ -190,7 +221,10 @@ INLINE QCOMP_ALIAS fast_getPauliStrSumElem(QCOMP_ALIAS* coeffs, PauliStr* string
 
     // this loop is expected exponentially smaller than caller's loop
     for (qindex n=0; n<numTerms; n++)
-        elem = elem + coeffs[n] * fast_getPauliStrElem(strings[n], row, col); // += is HIP-incomaptible
+        elem = fast_addQcompAlias(
+            elem,
+            fast_mulQcompAlias(coeffs[n], fast_getPauliStrElem(strings[n], row, col))
+        );
 
     return elem;
 }
@@ -200,4 +234,4 @@ INLINE QCOMP_ALIAS fast_getPauliStrSumElem(QCOMP_ALIAS* coeffs, PauliStr* string
 // avoid exposing alias macro outside header
 #undef QCOMP_ALIAS
 
-#endif // FASTMATH_HPP
\ No newline at end of file
+#endif // FASTMATH_HPP
diff --git a/quest/src/gpu/cuda_to_hip.hpp b/quest/src/gpu/cuda_to_hip.hpp
index 880291c7..4fa6e560 100644
--- a/quest/src/gpu/cuda_to_hip.hpp
+++ b/quest/src/gpu/cuda_to_hip.hpp
@@ -69,6 +69,20 @@ static constexpr int maxWarpsPerBlock = 1024/WARPSIZE;
 #define cuDoubleComplex hipDoubleComplex
 #define make_cuFloatComplex make_hipFloatComplex
 #define make_cuDoubleComplex make_hipDoubleComplex
+#define cuCadd hipCadd
+#define cuCaddf hipCaddf
+#define cuCsub hipCsub
+#define cuCsubf hipCsubf
+#define cuCmul hipCmul
+#define cuCmulf hipCmulf
+#define cuCdiv hipCdiv
+#define cuCdivf hipCdivf
+#define cuConj hipConj
+#define cuConjf hipConjf
+#define cuCreal hipCreal
+#define cuCrealf hipCrealf
+#define cuCimag hipCimag
+#define cuCimagf hipCimagf
 
 
 static void __attribute__((unused)) check(const hipError_t err, const char *const file, const int line)
@@ -79,4 +93,4 @@ static void __attribute__((unused)) check(const hipError_t err, const char *cons
   exit(err);
 }
 
-#endif //CUDA_TO_HIP_HPP
\ No newline at end of file
+#endif //CUDA_TO_HIP_HPP
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
index 0d33c26a..30a3264b 100644
--- a/quest/src/gpu/gpu_kernels.cuh
+++ b/quest/src/gpu/gpu_kernels.cuh
@@ -214,8 +214,8 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subA(
     cu_qcomp amp0 = amps[i0];
     cu_qcomp amp1 = amps[i1];
 
-    amps[i0] = m00*amp0 + m01*amp1;
-    amps[i1] = m10*amp0 + m11*amp1;
+    amps[i0] = addCuQcomp(mulCuQcomp(m00, amp0), mulCuQcomp(m01, amp1));
+    amps[i1] = addCuQcomp(mulCuQcomp(m10, amp0), mulCuQcomp(m11, amp1));
 }
 
 
@@ -234,7 +234,7 @@ __global__ void kernel_statevec_anyCtrlOneTargDenseMatr_subB(
     qindex i = insertBitsWithMaskedValues(n, ctrls, numCtrlBits, ctrlStateMask);
 
     // caller offsets buffer by receive-index
-    amps[i] = fac0*amps[i] + fac1*buffer[n];
+    amps[i] = addCuQcomp(mulCuQcomp(fac0, amps[i]), mulCuQcomp(fac1, buffer[n]));
 }
 
 
@@ -271,10 +271,22 @@ __global__ void kernel_statevec_anyCtrlTwoTargDenseMatr_sub(
     cu_qcomp amp11 = amps[i11];
 
     // amps[i_n] = sum_j elems[n][j] amp[i_n]
-    amps[i00] = m00*amp00 + m01*amp01 + m02*amp10 + m03*amp11;
-    amps[i01] = m10*amp00 + m11*amp01 + m12*amp10 + m13*amp11;
-    amps[i10] = m20*amp00 + m21*amp01 + m22*amp10 + m23*amp11;
-    amps[i11] = m30*amp00 + m31*amp01 + m32*amp10 + m33*amp11;
+    amps[i00] = addCuQcomp(
+        addCuQcomp(mulCuQcomp(m00, amp00), mulCuQcomp(m01, amp01)),
+        addCuQcomp(mulCuQcomp(m02, amp10), mulCuQcomp(m03, amp11))
+    );
+    amps[i01] = addCuQcomp(
+        addCuQcomp(mulCuQcomp(m10, amp00), mulCuQcomp(m11, amp01)),
+        addCuQcomp(mulCuQcomp(m12, amp10), mulCuQcomp(m13, amp11))
+    );
+    amps[i10] = addCuQcomp(
+        addCuQcomp(mulCuQcomp(m20, amp00), mulCuQcomp(m21, amp01)),
+        addCuQcomp(mulCuQcomp(m22, amp10), mulCuQcomp(m23, amp11))
+    );
+    amps[i11] = addCuQcomp(
+        addCuQcomp(mulCuQcomp(m30, amp00), mulCuQcomp(m31, amp01)),
+        addCuQcomp(mulCuQcomp(m32, amp10), mulCuQcomp(m33, amp11))
+    );
 }
 
 
@@ -354,7 +366,7 @@ __global__ void kernel_statevec_anyCtrlFewTargDenseMatr(
                 elem.y *= -1;
 
             // thread-private cache is accessed with compile-time known index
-            amps[i] = amps[i] + (elem * privateCache[l]);
+            amps[i] = addCuQcomp(amps[i], mulCuQcomp(elem, privateCache[l]));
         }
     }
 }
@@ -415,7 +427,7 @@ __global__ void kernel_statevec_anyCtrlManyTargDenseMatr(
                 if constexpr (ApplyConj)
                     elem.y *= -1;
 
-                amps[i] = amps[i] + (elem * globalCache[j]);
+                amps[i] = addCuQcomp(amps[i], mulCuQcomp(elem, globalCache[j]));
 
                 /// @todo
                 /// qureg.cpuAmps[i] is being serially updated by only this thread,
@@ -462,7 +474,8 @@ __global__ void kernel_statevec_anyCtrlOneTargDiagMatr_sub(
     qindex i = concatenateBits(rank, j, logNumAmpsPerNode);
 
     int b = getBit(i, targ);
-    amps[j] = amps[j] * (m1 + b * (m2 - m1));
+    cu_qcomp elem = addCuQcomp(m1, mulCuQcomp((qreal) b, subCuQcomp(m2, m1)));
+    amps[j] = mulCuQcomp(amps[j], elem);
 }
 
 
@@ -503,7 +516,7 @@ __global__ void kernel_statevec_anyCtrlTwoTargDiagMatr_sub(
     // k = local elem index
     int k = getTwoBits(i, targ2, targ1);
     cu_qcomp elems[] = {m1, m2, m3, m4};
-    amps[j] = amps[j] * elems[k];
+    amps[j] = mulCuQcomp(amps[j], elems[k]);
 }
 
 
@@ -553,7 +566,7 @@ __global__ void kernel_statevec_anyCtrlAnyTargDiagMatr_sub(
     if constexpr (ApplyConj)
         elem.y *= -1;
 
-    amps[j] = amps[j] * elem;
+    amps[j] = mulCuQcomp(amps[j], elem);
 }
 
 
@@ -595,10 +608,10 @@ __global__ void kernel_densmatr_allTargDiagMatr_sub(
         if constexpr (ConjRight)
             term.y *= -1;
 
-        fac = fac * term;
+        fac = mulCuQcomp(fac, term);
     }
 
-    amps[n] = amps[n] * fac;
+    amps[n] = mulCuQcomp(amps[n], fac);
 }
 
 
@@ -636,15 +649,21 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subA(
     // determine whether to multiply amps by +-1 or +-i
     int parA = cudaGetBitMaskParity(iA & maskYZ);
     int parB = cudaGetBitMaskParity(iB & maskYZ);
-    cu_qcomp coeffA = powI * fast_getPlusOrMinusOne(parA);
-    cu_qcomp coeffB = powI * fast_getPlusOrMinusOne(parB);
+    cu_qcomp coeffA = mulCuQcomp(powI, (qreal) fast_getPlusOrMinusOne(parA));
+    cu_qcomp coeffB = mulCuQcomp(powI, (qreal) fast_getPlusOrMinusOne(parB));
 
     cu_qcomp ampA = amps[iA];
     cu_qcomp ampB = amps[iB];
 
     // mix or swap scaled amp pair
-    amps[iA] = (ampFac * ampA) + (pairAmpFac * coeffB * ampB);
-    amps[iB] = (ampFac * ampB) + (pairAmpFac * coeffA * ampA);
+    amps[iA] = addCuQcomp(
+        mulCuQcomp(ampFac, ampA),
+        mulCuQcomp(pairAmpFac, mulCuQcomp(coeffB, ampB))
+    );
+    amps[iB] = addCuQcomp(
+        mulCuQcomp(ampFac, ampB),
+        mulCuQcomp(pairAmpFac, mulCuQcomp(coeffA, ampA))
+    );
 }
 
 
@@ -671,9 +690,12 @@ __global__ void kernel_statevector_anyCtrlPauliTensorOrGadget_subB(
 
     // determine whether to multiply buffer amp by +-1 or +-i
     int par = cudaGetBitMaskParity(k & maskYZ);
-    cu_qcomp coeff = powI * fast_getPlusOrMinusOne(par);
+    cu_qcomp coeff = mulCuQcomp(powI, (qreal) fast_getPlusOrMinusOne(par));
 
-    amps[i] = (thisAmpFac * amps[i]) + (otherAmpFac * coeff * buffer[j]);
+    amps[i] = addCuQcomp(
+        mulCuQcomp(thisAmpFac, amps[i]),
+        mulCuQcomp(otherAmpFac, mulCuQcomp(coeff, buffer[j]))
+    );
 }
 
 
@@ -701,7 +723,7 @@ __global__ void kernel_statevector_anyCtrlAnyTargZOrPhaseGadget_sub(
     int p = cudaGetBitMaskParity(i & targMask);
 
     cu_qcomp facs[] = {fac0, fac1};
-    amps[i] = amps[i] * facs[p];
+    amps[i] = mulCuQcomp(amps[i], facs[p]);
 }
 
 
diff --git a/quest/src/gpu/gpu_thrust.cuh b/quest/src/gpu/gpu_thrust.cuh
index 1a5d6c57..c49eb625 100644
--- a/quest/src/gpu/gpu_thrust.cuh
+++ b/quest/src/gpu/gpu_thrust.cuh
@@ -200,14 +200,14 @@ struct functor_getAmpReal {
 struct functor_getAmpConjProd {
 
     __host__ __device__ cu_qcomp operator()(cu_qcomp braAmp, cu_qcomp ketAmp) { 
-        return getCompConj(braAmp) * ketAmp;
+        return mulCuQcomp(getCompConj(braAmp), ketAmp);
     }
 };
 
 struct functor_getNormOfAmpDif {
 
     __host__ __device__ qreal operator()(cu_qcomp amp1, cu_qcomp amp2) { 
-        return getCompNorm(amp1 - amp2);
+        return getCompNorm(subCuQcomp(amp1, amp2));
     }
 };
 
@@ -271,7 +271,7 @@ struct functor_getExpecStateVecPauliTerm {
         int sign = fast_getPlusOrMinusOne(par);
 
         // sign excludes i^numY contribution
-        return sign * getCompConj(amps[n]) * pairAmps[j]; // pairAmps may be amps or buffer
+        return mulCuQcomp((qreal) sign, mulCuQcomp(getCompConj(amps[n]), pairAmps[j]));
     }
 };
 
@@ -326,7 +326,7 @@ struct functor_getExpecDensMatrDiagMatrTerm {
 
         qindex i = fast_getQuregLocalIndexOfDiagonalAmp(n, firstDiagInd, numAmpsPerCol);
 
-        return amps[i] * elem;
+        return mulCuQcomp(amps[i], elem);
     }
 };
 
@@ -413,7 +413,7 @@ struct functor_multiplyElemPowerWithAmpOrNorm {
         if constexpr (Norm)
             quregAmp = getCuQcomp(getCompNorm(quregAmp), 0);
 
-        return matrElem * quregAmp;
+        return mulCuQcomp(matrElem, quregAmp);
     }
 };
 
@@ -498,7 +498,7 @@ struct functor_getFidelityTerm {
         } else
             rowAmp = getCompConj(rowAmp);
 
-        cu_qcomp fid = rhoAmp * rowAmp * colAmp;
+        cu_qcomp fid = mulCuQcomp(mulCuQcomp(rhoAmp, rowAmp), colAmp);
         return fid;
     }
 };
@@ -928,7 +928,7 @@ cu_qcomp thrust_statevec_calcExpecPauliStr_subA(Qureg qureg, vector<int> x, vect
 
     cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus<cu_qcomp>());
 
-    return value * toCuQcomp(util_getPowerOfI(y.size()));
+    return mulCuQcomp(value, toCuQcomp(util_getPowerOfI(y.size())));
 }
 
 
@@ -946,7 +946,7 @@ cu_qcomp thrust_statevec_calcExpecPauliStr_subB(Qureg qureg, vector<int> x, vect
 
     cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus<cu_qcomp>());
 
-    return value * toCuQcomp(util_getPowerOfI(y.size()));
+    return mulCuQcomp(value, toCuQcomp(util_getPowerOfI(y.size())));
 }
 
 
@@ -964,7 +964,7 @@ cu_qcomp thrust_densmatr_calcExpecPauliStr_sub(Qureg qureg, vector<int> x, vecto
 
     cu_qcomp value = thrust::transform_reduce(indIter, endIter, functor, init, thrust::plus<cu_qcomp>());
 
-    return value * toCuQcomp(util_getPowerOfI(y.size()));
+    return mulCuQcomp(value, toCuQcomp(util_getPowerOfI(y.size())));
 }
 
 
diff --git a/quest/src/gpu/gpu_types.cuh b/quest/src/gpu/gpu_types.cuh
index 6ff57774..60c233cd 100644
--- a/quest/src/gpu/gpu_types.cuh
+++ b/quest/src/gpu/gpu_types.cuh
@@ -137,21 +137,41 @@ __host__ inline std::array<cu_qcomp,16> unpackMatrixToCuQcomps(CompMatr2 in) {
 /*
  * cu_qcomp ARITHMETIC HELPERS
  *
- * These explicitly implement component-wise arithmetic and are used in
- * critical kernels/functors where backend operator overload behaviour has
- * varied across toolchains.
+ * These wrap the cuComplex helper API and are used in critical kernels/functors
+ * where backend operator overload behaviour has varied across toolchains.
  */
 
 
 INLINE cu_qcomp addCuQcomp(cu_qcomp a, cu_qcomp b) {
-    return getCuQcomp(a.x + b.x, a.y + b.y);
+#if (FLOAT_PRECISION == 1)
+    return cuCaddf(a, b);
+#else
+    return cuCadd(a, b);
+#endif
+}
+
+INLINE cu_qcomp subCuQcomp(cu_qcomp a, cu_qcomp b) {
+#if (FLOAT_PRECISION == 1)
+    return cuCsubf(a, b);
+#else
+    return cuCsub(a, b);
+#endif
 }
 
 INLINE cu_qcomp mulCuQcomp(cu_qcomp a, cu_qcomp b) {
-    return getCuQcomp(
-        (a.x * b.x) - (a.y * b.y),
-        (a.x * b.y) + (a.y * b.x)
-    );
+#if (FLOAT_PRECISION == 1)
+    return cuCmulf(a, b);
+#else
+    return cuCmul(a, b);
+#endif
+}
+
+INLINE cu_qcomp divCuQcomp(cu_qcomp a, cu_qcomp b) {
+#if (FLOAT_PRECISION == 1)
+    return cuCdivf(a, b);
+#else
+    return cuCdiv(a, b);
+#endif
 }
 
 INLINE cu_qcomp mulCuQcomp(cu_qcomp a, qreal b) {
@@ -167,11 +187,10 @@ INLINE cu_qcomp mulCuQcomp(qreal b, cu_qcomp a) {
 /*
  * cu_qcomp ARITHMETIC OVERLOADS
  *
- * which are only needed by NVCC because
- * HIP defines them for us. This good deed
- * goes punished; a HIP bug disables our
- * use of *= and += overloads, so kernels.cuh
- * has disgusting (x = x * y) statements. Bah!
+ * which are only needed by NVCC because HIP
+ * defines them for us. GPU kernels should still
+ * prefer the helpers above for complex-complex
+ * multiply and divide to avoid backend quirks.
  */
 
 
@@ -263,12 +282,19 @@ INLINE cu_qcomp operator * (const qreal& b, const cu_qcomp& a) {
 
 
 INLINE qreal getCompReal(cu_qcomp num) {
-    return num.x;
+#if (FLOAT_PRECISION == 1)
+    return cuCrealf(num);
+#else
+    return cuCreal(num);
+#endif
 }
 
 INLINE cu_qcomp getCompConj(cu_qcomp num) {
-    num.y *= -1;
-    return num;
+#if (FLOAT_PRECISION == 1)
+    return cuConjf(num);
+#else
+    return cuConj(num);
+#endif
 }
 
 INLINE qreal getCompNorm(cu_qcomp num) {
diff --git a/tests/unit/calculations.cpp b/tests/unit/calculations.cpp
index 4b4db284..c08c98cc 100644
--- a/tests/unit/calculations.cpp
+++ b/tests/unit/calculations.cpp
@@ -178,6 +178,39 @@ TEST_CASE( "calcExpecPauliStr", TEST_CATEGORY ) {
 }
 
 
+TEST_CASE( "calcExpecPauliStr GPU regression preserves complex contributions", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        if (!getQuESTEnv().isGpuAccelerated)
+            return;
+
+        int numQubits = getNumCachedQubits();
+        qindex dim = getPow2(numQubits);
+        qreal amp = 1 / std::sqrt((qreal) 2);
+        qvector ref = getZeroVector(dim);
+        PauliStr str = getPauliStr("Y", {0});
+
+        ref[0] = qcomp(amp, 0);
+        ref[1] = qcomp(0, amp);
+
+        qreal expected = std::real(getReferenceExpectationValue(ref, str));
+
+        for (auto& [label, qureg] : getCachedStatevecs()) {
+
+            if (!qureg.isGpuAccelerated)
+                continue;
+
+            DYNAMIC_SECTION( label ) {
+
+                setQuregToReference(qureg, ref);
+                REQUIRE_AGREE( calcExpecPauliStr(qureg, str), expected );
+            }
+        }
+    }
+}
+
+
 
 TEST_CASE( "calcExpecPauliStrSum", TEST_CATEGORY ) {
 
diff --git a/tests/unit/initialisations.cpp b/tests/unit/initialisations.cpp
index ac1f1abd..40a6c712 100644
--- a/tests/unit/initialisations.cpp
+++ b/tests/unit/initialisations.cpp
@@ -406,6 +406,42 @@ TEST_CASE( "setQuregToPauliStrSum", TEST_CATEGORY ) {
 }
 
 
+TEST_CASE( "setQuregToPauliStrSum GPU regression preserves imaginary amplitudes", TEST_CATEGORY ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        if (!getQuESTEnv().isGpuAccelerated)
+            return;
+
+        int numQubits = getNumCachedQubits();
+        std::vector<PauliStr> strings = {
+            getPauliStr("X", {0}),
+            getPauliStr("Y", {0})
+        };
+        std::vector<qcomp> coeffs = {
+            qcomp(0.25, 0.50),
+            qcomp(-0.50, 0.75)
+        };
+        PauliStrSum sum = createPauliStrSum(strings, coeffs);
+        qmatrix refMat = getMatrix(sum, numQubits);
+
+        for (auto& [label, qureg] : getCachedDensmatrs()) {
+
+            if (!qureg.isGpuAccelerated)
+                continue;
+
+            DYNAMIC_SECTION( label ) {
+
+                setQuregToPauliStrSum(qureg, sum);
+                REQUIRE_AGREE( qureg, refMat );
+            }
+        }
+
+        destroyPauliStrSum(sum);
+    }
+}
+
+
 TEST_CASE( "setQuregToWeightedSum", TEST_CATEGORY ) {
 
     SECTION( LABEL_CORRECTNESS ) {
diff --git a/tests/unit/operations.cpp b/tests/unit/operations.cpp
index 0e33220d..ab449581 100644
--- a/tests/unit/operations.cpp
+++ b/tests/unit/operations.cpp
@@ -2365,6 +2365,39 @@ TEST_CASE( "applyNonUnitaryPauliGadget", TEST_CATEGORY_OPS ) {
 }
 
 
+TEST_CASE( "applyPauliY GPU regression preserves imaginary amplitudes", TEST_CATEGORY_OPS ) {
+
+    SECTION( LABEL_CORRECTNESS ) {
+
+        if (!getQuESTEnv().isGpuAccelerated)
+            return;
+
+        int numQubits = getNumCachedQubits();
+        qindex dim = getPow2(numQubits);
+        qreal amp = 1 / std::sqrt((qreal) dim);
+        qvector in = getConstantVector(dim, qcomp(amp, 0));
+        qvector ref = in;
+
+        applyReferenceOperator(ref, {0}, FixedMatrices::Y);
+
+        for (auto& [label, qureg] : getCachedStatevecs()) {
+
+            if (!qureg.isGpuAccelerated)
+                continue;
+
+            DYNAMIC_SECTION( label ) {
+
+                setQuregToReference(qureg, in);
+                applyPauliY(qureg, 0);
+
+                REQUIRE_AGREE( qureg, ref );
+                REQUIRE_AGREE( calcTotalProb(qureg), getReferenceProbability(ref) );
+            }
+        }
+    }
+}
+
+
 /** @} (end defgroup) */