Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/engine/EngineBuildLoadNetwork.inl
Original file line number Diff line number Diff line change
Expand Up @@ -303,6 +303,8 @@ bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3> &sub
int32_t inputW = inputDims.d[3];

int32_t minInputWidth = std::max(m_options.minInputWidth, inputW);
//? in case there is no dynamic-width support, fall back to inputW
if (not doesSupportDynamicWidth) {minInputWidth = inputW;}

// Specify the optimization profile`
if (doesSupportDynamicBatch) {
Expand Down Expand Up @@ -393,4 +395,4 @@ bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3> &sub

Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));
return true;
}
}
4 changes: 3 additions & 1 deletion include/engine/EngineRunInference.inl
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &i
cudaStream_t inferenceCudaStream;
Util::checkCudaErrorCode(cudaStreamCreate(&inferenceCudaStream));

//? this vector is REQUIRED: it keeps each preprocessed input's GpuMat alive (owns the memory) for the duration of inference
//? a better name would be matsLifetime / memoryLifetime / memoryStorage
std::vector<cv::cuda::GpuMat> preprocessedInputs;

// Preprocess all the inputs
Expand Down Expand Up @@ -80,7 +82,7 @@ bool Engine<T>::runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &i
// Copy over the input data and perform the preprocessing
auto mfloat = blobFromGpuMats(batchInput, m_subVals, m_divVals, m_normalize);
preprocessedInputs.push_back(mfloat);
m_buffers[i] = mfloat.ptr<void>();
m_buffers[i] = mfloat.ptr<void>(); //FIXME: buffer indexing is wrong for TRT 10 (see the binding-order note in clearGpuBuffers)
}

// Ensure all dynamic bindings have been defined.
Expand Down
116 changes: 100 additions & 16 deletions include/engine/EngineUtilities.inl
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ cv::cuda::GpuMat Engine<T>::resizeKeepAspectRatioPadRightBottom(const cv::cuda::
cv::cuda::GpuMat re(unpad_h, unpad_w, CV_8UC3);
cv::cuda::resize(input, re, re.size());
cv::cuda::GpuMat out(height, width, CV_8UC3, bgcolor);
//? this function positions the resized image at (0,0) (top-left), padding right/bottom
//? if you need the image centered, change this and the post-process index logic in your implementation
re.copyTo(out(cv::Rect(0, 0, re.cols, re.rows)));
return out;
}
Expand Down Expand Up @@ -94,9 +96,101 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>

CHECK(!batchInput.empty())
CHECK(batchInput[0].channels() == 3)

cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3);

//! this rewrite was tested on a real-world neural network
//! the network behaves as intended, but the test dataset differs from real data
//! the code logic produces the expected result, yet that result is not the final valid output (it jumps around the target position)
//! if you spot an error in the rewrite, note it and fix it

bool const& invert_channels = swapRB;

std::vector<cv::cuda::GpuMat> const& individualMats = batchInput;
auto const& indMref = individualMats[0];
size_t size =
individualMats.size()
*
indMref.cols //? assumes all mats already have a valid (identical) size
*
indMref.rows
*
3 //? assumed rgb
;

//? here gpu_dst is JUST raw memory; it is not a valid cv::Mat image
//? total size cannot exceed 1<<31 because cv::Mat dimensions are stored as int
//? you can't set (height,width); you must set (1,height*width) //* because of row padding //%% if you really need a bigger number and depth (or another dim) is below OpenCV's maximum, use CV_32FC(depth); the resulting layout will be the same
//cv::cuda::GpuMat gpu_dst(1, batchInput[0].rows * batchInput[0].cols * batchInput.size(), CV_8UC3);
cv::cuda::GpuMat gpu_dst(1, (int)size, CV_32FC(1));
const size_t result_size_one_batch = size_t(indMref.cols) * size_t(indMref.rows);

for(uint batch_index = 0; batch_index < individualMats.size(); ++batch_index)
{
cv::cuda::GpuMat mfloat;
if (normalize) {
// [0.f, 1.f]
individualMats[batch_index].convertTo(mfloat, CV_32FC3, 1.f / 255.f); //NOLINT magic 255.f
} else {
// [0.f, 255.f]
individualMats[batch_index].convertTo(mfloat, CV_32FC3);
}
// Apply scaling and mean subtraction
//NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index) // the index is provably constant here
cv::cuda::subtract(mfloat, cv::Scalar(subVals[invert_channels ? 2 : 0], subVals[1], subVals[invert_channels ? 0 : 2]), mfloat, cv::noArray(), -1);
//NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-constant-array-index) // the index is provably constant here
cv::cuda::divide(mfloat, cv::Scalar(divVals[invert_channels ? 2 : 0], divVals[1], divVals[invert_channels ? 0 : 2]), mfloat, 1, -1);

///
//////
///

//? what is happening here:
//? OpenCV provides a split function that copies channels into 3 separate regions efficiently
//? here those regions are pointers into gpu_dst rather than standalone buffers
//? so this simply splits the image's channels into the 3 given destinations
std::vector<cv::cuda::GpuMat> input_channels{
cv::cuda::GpuMat(
mfloat.rows,
mfloat.cols,
CV_32FC1,
gpu_dst.ptr<float>() //* note that it is float pointer
+//NOLINT pointer arithmetic
(result_size_one_batch * (invert_channels ? 2 : 0) /*R*/)
+ //NOLINT pointer arithmetic
(result_size_one_batch * 3 * batch_index)
),
cv::cuda::GpuMat(
mfloat.rows,
mfloat.cols,
CV_32FC1,
gpu_dst.ptr<float>()
+//NOLINT pointer arithmetic
(result_size_one_batch *1 /*G*/)
+ //NOLINT pointer arithmetic
(result_size_one_batch * 3 * batch_index)
),
cv::cuda::GpuMat(
mfloat.rows,
mfloat.cols,
CV_32FC1,
gpu_dst.ptr<float>()
+//NOLINT pointer arithmetic
(result_size_one_batch * (invert_channels ? 0 : 2) /*B*/)
+ //NOLINT pointer arithmetic
(result_size_one_batch * 3 * batch_index)
)
};
cv::cuda::split(mfloat, input_channels); // HWC -> CHW
// ^^^ via batch_index this also fills the blocks along the batch dimension
}//for all batches

return gpu_dst;

//* end of function, that was final return

/*
//? original for compare

//width here == result_size_batch
size_t width = batchInput[0].cols * batchInput[0].rows;
if (swapRB) {
for (size_t img = 0; img < batchInput.size(); ++img) {
Expand All @@ -115,26 +209,16 @@ cv::cuda::GpuMat Engine<T>::blobFromGpuMats(const std::vector<cv::cuda::GpuMat>
cv::cuda::split(batchInput[img], input_channels); // HWC -> CHW
}
}
cv::cuda::GpuMat mfloat;
if (normalize) {
// [0.f, 1.f]
gpu_dst.convertTo(mfloat, CV_32FC3, 1.f / 255.f);
} else {
// [0.f, 255.f]
gpu_dst.convertTo(mfloat, CV_32FC3);
}

// Apply scaling and mean subtraction
cv::cuda::subtract(mfloat, cv::Scalar(subVals[0], subVals[1], subVals[2]), mfloat, cv::noArray(), -1);
cv::cuda::divide(mfloat, cv::Scalar(divVals[0], divVals[1], divVals[2]), mfloat, 1, -1);

return mfloat;
*/
}

template <typename T> void Engine<T>::clearGpuBuffers() {
if (!m_buffers.empty()) {
// Free GPU memory of outputs
const auto numInputs = m_inputDims.size();
//FIXME: requires a rewrite to work with TRT 10
//FIXME: this assumes the binding order is <input0,input1,output0,output1>, but that does not hold in TRT 10
//FIXME: and the same assumption is repeated elsewhere!
for (int32_t outputBinding = numInputs; outputBinding < m_engine->getNbIOTensors(); ++outputBinding) {
Util::checkCudaErrorCode(cudaFree(m_buffers[outputBinding]));
}
Expand Down
6 changes: 6 additions & 0 deletions scripts/build_opencv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,12 @@ cmake -D CMAKE_BUILD_TYPE=RELEASE \
-D CUDNN_INCLUDE_DIR=/usr/local/cuda/include \
-D CUDNN_LIBRARY=/usr/local/cuda/lib64/libcudnn.so \
..
#! cuDNN is unused in this project: it is not required to work with cv::cuda::GpuMat, nor for the NMS function
#? cuDNN is required only by TensorRT itself, not for this project's OpenCV usage # (and TensorRT does not use OpenCV)
#-D CUDNN_INCLUDE_DIR=/usr/local/cuda/include \
#-D CUDNN_LIBRARY=/usr/local/cuda/lib64/libcudnn.so \
#-D OPENCV_DNN_CUDA=ON \
#-D WITH_CUDNN=ON \

make -j 8
sudo make -j 8 install
3 changes: 3 additions & 0 deletions src/engine.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,9 @@ struct Options {
int32_t minInputWidth = -1; // Default to -1 --> expecting fixed input size
// Optimal input width
int32_t optInputWidth = -1; // Default to -1 --> expecting fixed input size

//TODO: add support for dynamic height and depth
//TODO: implementing this properly requires an additional interface, to avoid checking the parameter everywhere
};

// Class to extend TensorRT logger
Expand Down