diff --git a/include/engine/EngineBuildLoadNetwork.inl b/include/engine/EngineBuildLoadNetwork.inl index 73c9a30..2c46800 100644 --- a/include/engine/EngineBuildLoadNetwork.inl +++ b/include/engine/EngineBuildLoadNetwork.inl @@ -303,6 +303,8 @@ bool Engine::build(std::string onnxModelPath, const std::array &sub int32_t inputW = inputDims.d[3]; int32_t minInputWidth = std::max(m_options.minInputWidth, inputW); + //? in case there is no dynamic width, use inputW anyway + if (not doesSupportDynamicWidth) {minInputWidth = inputW;} // Specify the optimization profile` if (doesSupportDynamicBatch) { @@ -393,4 +395,4 @@ bool Engine::build(std::string onnxModelPath, const std::array &sub Util::checkCudaErrorCode(cudaStreamDestroy(profileStream)); return true; -} \ No newline at end of file +} diff --git a/include/engine/EngineRunInference.inl b/include/engine/EngineRunInference.inl index 4542a42..1232bf5 100644 --- a/include/engine/EngineRunInference.inl +++ b/include/engine/EngineRunInference.inl @@ -51,6 +51,8 @@ bool Engine::runInference(const std::vector> &i cudaStream_t inferenceCudaStream; Util::checkCudaErrorCode(cudaStreamCreate(&inferenceCudaStream)); + //? this vector is REQUIRED: it holds the lifetime (memory) of each preprocessed input + //? a better name would be matsLifetime / memoryLifetime / memoryStorage std::vector preprocessedInputs; // Preprocess all the inputs @@ -80,7 +82,7 @@ bool Engine::runInference(const std::vector> &i // Copy over the input data and perform the preprocessing auto mfloat = blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize); preprocessedInputs.push_back(mfloat); - m_buffers[i] = mfloat.ptr(); + m_buffers[i] = mfloat.ptr(); //fix buffer indexing is wrong for trt10... } // Ensure all dynamic bindings have been defined. 
diff --git a/include/engine/EngineUtilities.inl b/include/engine/EngineUtilities.inl index d469ee6..e5b2cb0 100644 --- a/include/engine/EngineUtilities.inl +++ b/include/engine/EngineUtilities.inl @@ -32,6 +32,8 @@ cv::cuda::GpuMat Engine::resizeKeepAspectRatioPadRightBottom(const cv::cuda:: cv::cuda::GpuMat re(unpad_h, unpad_w, CV_8UC3); cv::cuda::resize(input, re, re.size()); cv::cuda::GpuMat out(height, width, CV_8UC3, bgcolor); + //? this function positions the image at 0,0 + //? if you need the image centered, change it here and in the post-processing index logic of your implementation re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows))); return out; } @@ -94,9 +96,101 @@ cv::cuda::GpuMat Engine::blobFromGpuMats(const std::vector CHECK(!batchInput.empty()) CHECK(batchInput[0].channels() == 3) - - cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3); + //! the rewrite was tested on a real-world nn + //! the nn behaves as wanted, but the dataset differs from real data + //! the code logic produces the expected result, but the expected result is not a final valid output (it jumps around the target position) + //! if you see an error in the rewrite, note it and change it + + bool const& invert_channels = swapRB; + + std::vector const& individualMats = batchInput; + auto const& indMref = individualMats[0]; + size_t size = + individualMats.size() + * + indMref.cols //? assumes the mats already have a valid size + * + indMref.rows + * + 3 //? assumes rgb + ; + + //? here gpu_dst is JUST memory, it is not a valid cv mat + //? you cannot have size > 1<<31 since a mat's size is an int variable + //? 
you can't set (height,width), you have to set (1,height*width) //* because of padding //%% if you really want a bigger number, if depth (or other dim) < max allowed by opencv, use CV_32FC(depth); the result layout will be the same + //cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3); + cv::cuda::GpuMat gpu_dst(1, (int)size, CV_32FC(1)); + const size_t result_size_one_batch = size_t(indMref.cols) * size_t(indMref.rows); + + for(uint batch_index = 0; batch_index < individualMats.size(); ++batch_index) + { + cv::cuda::GpuMat mfloat; + if (normalize) { + // [0.f, 1.f] + individualMats[batch_index].convertTo(mfloat, CV_32FC3, 1.f / 255.f); //NOLINT magic 255.f + } else { + // [0.f, 255.f] + individualMats[batch_index].convertTo(mfloat, CV_32FC3); + } + // Apply scaling and mean subtraction + //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index) //is it some sort of a joke? it is constant lol + cv::cuda::subtract(mfloat, cv::Scalar(subVals[invert_channels ? 2 : 0], subVals[1], subVals[invert_channels ? 0 : 2]), mfloat, cv::noArray(), -1); + //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index) //is it some sort of a joke? it is constant lol + cv::cuda::divide(mfloat, cv::Scalar(divVals[invert_channels ? 2 : 0], divVals[1], divVals[invert_channels ? 0 : 2]), mfloat, 1, -1); + + /// + ////// + /// + + //? basically what is happening here: + //? opencv provides a split function that should copy channels to 3 different regions efficiently + //? here these regions are pointers into gpu_dst instead of standalone buffers + //? so basically just split the image's channels to the 3 given pointers + std::vector input_channels{ + cv::cuda::GpuMat( + mfloat.rows, + mfloat.cols, + CV_32FC1, + gpu_dst.ptr() //* note that it is a float pointer + +//NOLINT pointer arithmetic + (result_size_one_batch * (invert_channels ? 
2 : 0) /*R*/) + + //NOLINT pointer arithmetic + (result_size_one_batch * 3 * batch_index) + ), + cv::cuda::GpuMat( + mfloat.rows, + mfloat.cols, + CV_32FC1, + gpu_dst.ptr() + +//NOLINT pointer arithmetic + (result_size_one_batch *1 /*G*/) + + //NOLINT pointer arithmetic + (result_size_one_batch * 3 * batch_index) + ), + cv::cuda::GpuMat( + mfloat.rows, + mfloat.cols, + CV_32FC1, + gpu_dst.ptr() + +//NOLINT pointer arithmetic + (result_size_one_batch * (invert_channels ? 0 : 2) /*B*/) + + //NOLINT pointer arithmetic + (result_size_one_batch * 3 * batch_index) + ) + }; + cv::cuda::split(mfloat, input_channels); // HWC -> CHW + // ^^^ via the batch_index offset it also fills the blocks along the batch dimension + }//for all batches + + return gpu_dst; + + //* end of function, that was the final return + + /* + //? original, kept for comparison + + //width here == result_size_one_batch size_t width = batchInput[0].cols * batchInput[0].rows; if (swapRB) { for (size_t img = 0; img < batchInput.size(); ++img) { @@ -115,26 +209,16 @@ cv::cuda::GpuMat Engine::blobFromGpuMats(const std::vector cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW } } - cv::cuda::GpuMat mfloat; - if (normalize) { - // [0.f, 1.f] - gpu_dst.convertTo(mfloat, CV_32FC3, 1.f / 255.f); - } else { - // [0.f, 255.f] - gpu_dst.convertTo(mfloat, CV_32FC3); - } - - // Apply scaling and mean subtraction - cv::cuda::subtract(mfloat, cv::Scalar(subVals[0], subVals[1], subVals[2]), mfloat, cv::noArray(), -1); - cv::cuda::divide(mfloat, cv::Scalar(divVals[0], divVals[1], divVals[2]), mfloat, 1, -1); - - return mfloat; + */ } template void Engine::clearGpuBuffers() { if (!m_buffers.empty()) { // Free GPU memory of outputs const auto numInputs = m_inputDims.size(); + //FIX REQUIRES a rewrite to work with trt10 + //fix here assumed that position is , but it isn't in trt10 + //fix and that repeats everywhere! 
for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbIOTensors(); ++outputBinding) { Util::checkCudaErrorCode(cudaFree(m_buffers[outputBinding])); } diff --git a/scripts/build_opencv.sh b/scripts/build_opencv.sh index bfb0818..7dc26b3 100755 --- a/scripts/build_opencv.sh +++ b/scripts/build_opencv.sh @@ -36,6 +36,12 @@ cmake -D CMAKE_BUILD_TYPE=RELEASE \ -D CUDNN_INCLUDE_DIR=/usr/local/cuda/include \ -D CUDNN_LIBRARY=/usr/local/cuda/lib64/libcudnn.so \ .. +#! cudnn is unused in this project: it is not required to work with cv::cuda::GpuMat, and it is not required for the nms function +#? cudnn is required only by trt itself, not for the opencv usage in this project //(and trt does not use opencv) +#-D CUDNN_INCLUDE_DIR=/usr/local/cuda/include \ +#-D CUDNN_LIBRARY=/usr/local/cuda/lib64/libcudnn.so \ +#-D OPENCV_DNN_CUDA=ON \ +#-D WITH_CUDNN=ON \ make -j 8 sudo make -j 8 install diff --git a/src/engine.h b/src/engine.h index d9caa0e..90486e7 100644 --- a/src/engine.h +++ b/src/engine.h @@ -55,6 +55,9 @@ struct Options { int32_t minInputWidth = -1; // Default to -1 --> expecting fixed input size // Optimal input width int32_t optInputWidth = -1; // Default to -1 --> expecting fixed input size + + //todo add support for dynamic height and depth + //todo properly implementing it requires an additional interface, to avoid checking for the param everywhere }; // Class to extend TensorRT logger