Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/engine/EngineBuildLoadNetwork.inl
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3> &sub
int32_t inputW = inputDims.d[3];

int32_t minInputWidth = std::max(m_options.minInputWidth, inputW);
//? in case there is no dynamic-width support, fall back to inputW
if (not doesSupportDynamicWidth) {minInputWidth = inputW;}

// Specify the optimization profile`
if (doesSupportDynamicBatch) {
Expand Down Expand Up @@ -393,4 +395,4 @@ bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3> &sub

Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
return true;
}
}
4 changes: 3 additions & 1 deletion include/engine/EngineRunInference.inl
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &i
cudaStream_t inferenceCudaStream;
Util::checkCudaErrorCode(cudaStreamCreate(&inferenceCudaStream));

//? this vector is REQUIRED: it keeps each preprocessed input's GpuMat alive (owns the memory) for the duration of inference
//? a better name would be matsLifetime / memoryLifetime / memoryStorage
std::vector<cv::cuda::GpuMat> preprocessedInputs;

// Preprocess all the inputs
Expand Down Expand Up @@ -80,7 +82,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &i
// Copy over the input data and perform the preprocessing
auto mfloat = blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize);
preprocessedInputs.push_back(mfloat);
m_buffers[i] = mfloat.ptr<void>();
m_buffers[i] = mfloat.ptr<void>(); //FIXME: buffer indexing is wrong for TRT 10 (see the binding-order note in clearGpuBuffers)
}

// Ensure all dynamic bindings have been defined.
Expand Down
116 changes: 100 additions & 16 deletions include/engine/EngineUtilities.inl
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::
cv::cuda::GpuMat re(unpad_h, unpad_w, CV_8UC3);
cv::cuda::resize(input, re, re.size());
cv::cuda::GpuMat out(height, width, CV_8UC3, bgcolor);
//? this function positions the resized image at (0,0) (top-left), padding right/bottom
//? if you need the image centered, change this and the post-process index logic in your implementation
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
return out;
}
Expand Down Expand Up @@ -94,9 +96,101 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>

CHECK(!batchInput.empty())
CHECK(batchInput[0].channels() == 3)

cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3);

//! this rewrite was tested on a real-world neural network
//! the network behaves as intended, but the test dataset differs from real data
//! the code logic produces the expected result, yet that result is not the final valid output (it jumps around the target position)
//! if you spot an error in the rewrite, note it and fix it

bool const& invert_channels = swapRB;

std::vector<cv::cuda::GpuMat> const& individualMats = batchInput;
auto const& indMref = individualMats[0];
size_t size =
individualMats.size()
*
indMref.cols //? assumes all mats already have a valid (identical) size
*
indMref.rows
*
3 //? assumed rgb
;

//? here gpu_dst is JUST raw memory; it is not a valid cv::Mat image
//? total size cannot exceed 1<<31 because cv::Mat dimensions are stored as int
//? you can't set (height,width); you must set (1,height*width) //* because of row padding //%% if you really need a bigger number and depth (or another dim) is below OpenCV's maximum, use CV_32FC(depth); the resulting layout will be the same
//cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3);
cv::cuda::GpuMat gpu_dst(1, (int)size, CV_32FC(1));
const size_t result_size_one_batch = size_t(indMref.cols) * size_t(indMref.rows);

for(uint batch_index = 0; batch_index < individualMats.size(); ++batch_index)
{
cv::cuda::GpuMat mfloat;
if (normalize) {
// [0.f, 1.f]
individualMats[batch_index].convertTo(mfloat, CV_32FC3, 1.f / 255.f); //NOLINT magic 255.f
} else {
// [0.f, 255.f]
individualMats[batch_index].convertTo(mfloat, CV_32FC3);
}
// Apply scaling and mean subtraction
//NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index) // the index is provably constant here
cv::cuda::subtract(mfloat, cv::Scalar(subVals[invert_channels ? 2 : 0], subVals[1], subVals[invert_channels ? 0 : 2]), mfloat, cv::noArray(), -1);
//NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index) // the index is provably constant here
cv::cuda::divide(mfloat, cv::Scalar(divVals[invert_channels ? 2 : 0], divVals[1], divVals[invert_channels ? 0 : 2]), mfloat, 1, -1);

///
//////
///

//? what is happening here:
//? OpenCV provides a split function that copies channels into 3 separate regions efficiently
//? here those regions are pointers into gpu_dst rather than standalone buffers
//? so this simply splits the image's channels into the 3 given destinations
std::vector<cv::cuda::GpuMat> input_channels{
cv::cuda::GpuMat(
mfloat.rows,
mfloat.cols,
CV_32FC1,
gpu_dst.ptr<float>() //* note that it is float pointer
+//NOLINT pointer arithmetic
(result_size_one_batch * (invert_channels ? 2 : 0) /*R*/)
+ //NOLINT pointer arithmetic
(result_size_one_batch * 3 * batch_index)
),
cv::cuda::GpuMat(
mfloat.rows,
mfloat.cols,
CV_32FC1,
gpu_dst.ptr<float>()
+//NOLINT pointer arithmetic
(result_size_one_batch *1 /*G*/)
+ //NOLINT pointer arithmetic
(result_size_one_batch * 3 * batch_index)
),
cv::cuda::GpuMat(
mfloat.rows,
mfloat.cols,
CV_32FC1,
gpu_dst.ptr<float>()
+//NOLINT pointer arithmetic
(result_size_one_batch * (invert_channels ? 0 : 2) /*B*/)
+ //NOLINT pointer arithmetic
(result_size_one_batch * 3 * batch_index)
)
};
cv::cuda::split(mfloat, input_channels); // HWC -> CHW
// ^^^ via batch_index this also fills the blocks along the batch dimension
}//for all batches

return gpu_dst;

//* end of function, that was final return

/*
//? original for compare

//width here == result_size_batch
size_t width = batchInput[0].cols * batchInput[0].rows;
if (swapRB) {
for (size_t img = 0; img < batchInput.size(); ++img) {
Expand All @@ -115,26 +209,16 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW
}
}
cv::cuda::GpuMat mfloat;
if (normalize) {
// [0.f, 1.f]
gpu_dst.convertTo(mfloat, CV_32FC3, 1.f / 255.f);
} else {
// [0.f, 255.f]
gpu_dst.convertTo(mfloat, CV_32FC3);
}

// Apply scaling and mean subtraction
cv::cuda::subtract(mfloat, cv::Scalar(subVals[0], subVals[1], subVals[2]), mfloat, cv::noArray(), -1);
cv::cuda::divide(mfloat, cv::Scalar(divVals[0], divVals[1], divVals[2]), mfloat, 1, -1);

return mfloat;
*/
}

template <typename T> void Engine<T>::clearGpuBuffers() {
if (!m_buffers.empty()) {
// Free GPU memory of outputs
const auto numInputs = m_inputDims.size();
//FIXME: requires a rewrite to work with TRT 10
//FIXME: this assumes the binding order is <input0,input1,output0,output1>, but that does not hold in TRT 10
//FIXME: and the same assumption is repeated elsewhere!
for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbIOTensors(); ++outputBinding) {
Util::checkCudaErrorCode(cudaFree(m_buffers[outputBinding]));
}
Expand Down
6 changes: 6 additions & 0 deletions scripts/build_opencv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ cmake -D CMAKE_BUILD_TYPE=RELEASE \
-D CUDNN_INCLUDE_DIR=/usr/local/cuda/include \
-D CUDNN_LIBRARY=/usr/local/cuda/lib64/libcudnn.so \
..
#! cuDNN is unused in this project: it is not required to work with cv::cuda::GpuMat, nor for the NMS function
#? cuDNN is required only by TensorRT itself, not for this project's OpenCV usage # (and TensorRT does not use OpenCV)
#-D CUDNN_INCLUDE_DIR=/usr/local/cuda/include \
#-D CUDNN_LIBRARY=/usr/local/cuda/lib64/libcudnn.so \
#-D OPENCV_DNN_CUDA=ON \
#-D WITH_CUDNN=ON \

make -j 8
sudo make -j 8 install
3 changes: 3 additions & 0 deletions src/engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ struct Options {
int32_t minInputWidth = -1; // Default to -1 --> expecting fixed input size
// Optimal input width
int32_t optInputWidth = -1; // Default to -1 --> expecting fixed input size

//TODO: add support for dynamic height and depth
//TODO: implementing this properly requires an additional interface, to avoid checking the parameter everywhere
};

// Class to extend TensorRT logger
Expand Down