QuEST-Kit · otbrown · Apr 24, 2026 · Apr 24, 2026 · May 4, 2026 · May 4, 2026
diff --git a/quest/include/environment.h b/quest/include/environment.h
@@ -83,6 +83,14 @@ int isQuESTEnvInit();
 QuESTEnv getQuESTEnv();
 
 
+/** @notyetdoced
+ * Gets or sets the GPU threads-per-block value. The getter returns 0 when the QuEST environment
+ * is not GPU-accelerated; the setter forwards to the GPU backend, which reports an error when
+ * GPU support was not compiled. For OpenMP, use OMP_NUM_THREADS or omp_set_num_threads instead.
+ */
+int getQuESTGpuThreadsPerBlock();
+void setQuESTGpuThreadsPerBlock(const int NEW_TPB);
+
 
 // end de-mangler
 #ifdef __cplusplus

diff --git a/quest/src/api/environment.cpp b/quest/src/api/environment.cpp
@@ -509,5 +509,16 @@ void getEnvironmentString(char str[200]) {
 }
 
 
+int getQuESTGpuThreadsPerBlock() {
+    if (!getQuESTEnv().isGpuAccelerated) return 0; // nothing to report without GPU acceleration
+    return gpu_getNumThreadsPerBlock();
+}
+
+void setQuESTGpuThreadsPerBlock(const int NEW_TPB) {
+    // no check is made here: the GPU backend call below is relied upon
+    // to report an error when GPU support was not compiled
+    gpu_setNumThreadsPerBlock(NEW_TPB);
+}
+
 // end de-mangler
 }
diff --git a/quest/src/cpu/cpu_config.cpp b/quest/src/cpu/cpu_config.cpp
@@ -79,9 +79,7 @@ int cpu_getAvailableNumThreads() {
 #if COMPILE_OPENMP
     int n = -1;
 
-    #pragma omp parallel shared(n)
-    #pragma omp single
-    n = omp_get_num_threads();
+    n = omp_get_max_threads();
 
     return n;
 #else

diff --git a/quest/src/gpu/gpu_config.cpp b/quest/src/gpu/gpu_config.cpp
@@ -41,6 +41,7 @@
     #include "quest/src/gpu/cuda_to_hip.hpp"
 #endif
 
+[[maybe_unused]] static int numThreadsPerBlock = 128; // file-local default; read and written only via gpu_get/setNumThreadsPerBlock()
 
 
 /*
@@ -330,6 +331,24 @@ qindex gpu_getMaxNumConcurrentThreads() {
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock() {
+#if !COMPILE_CUDA
+    error_gpuQueriedButGpuNotCompiled();
+    return -1; // placeholder result after the error call
+#else
+    return numThreadsPerBlock; // the currently stored setting
+#endif
+}
+
+void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock) {
+#if !COMPILE_CUDA
+    error_gpuQueriedButGpuNotCompiled();
+#else
+    // NOTE(review): stored as given, without any range check - confirm callers validate
+    numThreadsPerBlock = newThreadsPerBlock;
+#endif
+}
+
 
 std::array<char,17> getBoundGpuUuid() {
 #if COMPILE_CUDA

diff --git a/quest/src/gpu/gpu_config.hpp b/quest/src/gpu/gpu_config.hpp
@@ -19,7 +19,6 @@
 #include "quest/include/channels.h"
 
 
-
 /*
  * CUDA ERROR HANDLING
  */
@@ -65,6 +64,10 @@ qindex gpu_getMaxNumConcurrentThreads();
  * ENVIRONMENT MANAGEMENT
  */
 
+int gpu_getNumThreadsPerBlock(); // calls the GPU-not-compiled error and returns -1 when built without CUDA
+
+void gpu_setNumThreadsPerBlock(const int newThreadsPerBlock); // calls the GPU-not-compiled error when built without CUDA
+
 void gpu_bindLocalGPUsToNodes();
 
 bool gpu_areAnyNodesBoundToSameGpu();
@@ -76,7 +79,6 @@ void gpu_initCuQuantum();
 void gpu_finalizeCuQuantum();
 
 
-
 /*
  * MEMORY MANAGEMENT
  */
@@ -122,4 +124,4 @@ size_t gpu_getCacheMemoryInBytes();
 
 
 
-#endif // GPU_CONFIG_HPP
+#endif // GPU_CONFIG_HPP
diff --git a/quest/src/gpu/gpu_kernels.cuh b/quest/src/gpu/gpu_kernels.cuh
@@ -46,23 +46,19 @@
  * THREAD MANAGEMENT
  */
 
-
-const int NUM_THREADS_PER_BLOCK = 128;
-
-
 __forceinline__ __device__ qindex getThreadInd() {
     return blockIdx.x*blockDim.x + threadIdx.x;
 }
 
 
-__host__ qindex getNumBlocks(qindex numThreads) {
-
+__host__ qindex getNumBlocks(qindex numThreads, const int numThreadsPerBlock) {
+    // returns the fewest blocks of numThreadsPerBlock threads that cover numThreads
     /// @todo
     /// improve this with cudaOccupancyMaxPotentialBlockSize(),
     /// making it function specific
 
-    // CUDA ceil
-    return ceil(numThreads / static_cast<qreal>(NUM_THREADS_PER_BLOCK));
+    // integer ceiling division: exact for every qindex, unlike ceil() of a rounded qreal quotient
+    return (numThreads + numThreadsPerBlock - 1) / numThreadsPerBlock;
 }