Visual Servoing Platform version 3.7.0
Loading...
Searching...
No Matches
tutorial-dnn-tensorrt-live.cpp
1// This tutorial uses NVIDIA TensorRT inference framework to perform object detection.
2// The object detection model is provided as a `.onnx` file. The model will be parsed
3// and a GPU Inference Engine (GIE) will be created if it doesn't exist. This GIE is
4// specific to the platform you're using.
5//
6// This tutorial was tested on NVIDIA Jetson TX2.
7//
8// The object detection model used is `SSD_Mobilenet V1` (Single Shot MultiBox Detector)
9// pre-trained on PASCAL VOC dataset. It can detect 20 classes.
10// For more information about the model, see this link:
11//
12// https://github.com/qfgaohao/pytorch-ssd
13//
14
15#include <iostream>
16
17#include <visp3/core/vpConfig.h>
18
19#if defined(VISP_HAVE_TENSORRT) && defined(VISP_HAVE_OPENCV)
20
21#if defined(HAVE_OPENCV_CUDEV) && defined(HAVE_OPENCV_CUDAWARPING) && defined(HAVE_OPENCV_CUDAARITHM) && \
22 defined(HAVE_OPENCV_DNN) && defined(HAVE_OPENCV_VIDEOIO)
23#include <visp3/core/vpImageConvert.h>
24#include <visp3/core/vpIoTools.h>
25#include <visp3/gui/vpDisplayFactory.h>
26
27#include <opencv2/videoio.hpp>
28
30#include <opencv2/core/cuda.hpp>
31#include <opencv2/cudaarithm.hpp>
32#include <opencv2/cudawarping.hpp>
33#include <opencv2/dnn.hpp>
35
37#include <cuda_runtime_api.h>
39
41#include <NvInfer.h>
42#include <NvOnnxParser.h>
44
45#include <sys/stat.h>
46
47#ifdef ENABLE_VISP_NAMESPACE
48using namespace VISP_NAMESPACE_NAME;
49#endif
50
52void preprocessImage(cv::Mat &img, float *gpu_input, const nvinfer1::Dims &dims, float meanR, float meanG, float meanB)
53{
54 if (img.empty()) {
55 std::cerr << "Image is empty." << std::endl;
56 return;
57 }
58
59 cv::cuda::GpuMat gpu_frame;
60 // Upload image to GPU
61 gpu_frame.upload(img);
62
63 // input_dims is in NxCxHxW format.
64 auto input_width = dims.d[3];
65 auto input_height = dims.d[2];
66 auto channels = dims.d[1];
67 auto input_size = cv::Size(input_width, input_height);
68
69 // Resize
70 cv::cuda::GpuMat resized;
71 cv::cuda::resize(gpu_frame, resized, input_size, 0, 0, cv::INTER_NEAREST);
72
73 // Normalize
74 cv::cuda::GpuMat flt_image;
75 resized.convertTo(flt_image, CV_32FC3);
76 cv::cuda::subtract(flt_image, cv::Scalar(meanR, meanG, meanB), flt_image, cv::noArray(), -1);
77 cv::cuda::divide(flt_image, cv::Scalar(127.5f, 127.5f, 127.5f), flt_image, 1, -1);
78
79 // To tensor
80 std::vector<cv::cuda::GpuMat> chw;
81 for (int i = 0; i < channels; ++i)
82 chw.emplace_back(cv::cuda::GpuMat(input_size, CV_32FC1, gpu_input + i * input_width * input_height));
83 cv::cuda::split(flt_image, chw);
84}
86
88size_t getSizeByDim(const nvinfer1::Dims &dims)
89{
90 size_t size = 1;
91 for (int i = 0; i < dims.nbDims; ++i)
92 size *= dims.d[i];
93 return size;
94}
96
98std::vector<cv::Rect> postprocessResults(std::vector<void *> buffers, const std::vector<nvinfer1::Dims> &output_dims,
99 int batch_size, int image_width, int image_height, float confThresh,
100 float nmsThresh, std::vector<int> &classIds)
101{
102 // private variables of vpDetectorDNN
103 std::vector<cv::Rect> m_boxes, m_boxesNMS;
104 std::vector<int> m_classIds;
105 std::vector<float> m_confidences;
106 std::vector<int> m_indices;
107
108 // copy results from GPU to CPU
109 std::vector<std::vector<float> > cpu_outputs;
110 for (size_t i = 0; i < output_dims.size(); i++) {
111 cpu_outputs.push_back(std::vector<float>(getSizeByDim(output_dims[i]) * batch_size));
112 cudaMemcpy(cpu_outputs[i].data(), (float *)buffers[1 + i], cpu_outputs[i].size() * sizeof(float),
113 cudaMemcpyDeviceToHost);
114 }
115
116 // post process
117 int N = output_dims[0].d[1], C = output_dims[0].d[2]; // (1 x N x C format); N: Number of output detection boxes
118 // (fixed in the model), C: Number of classes.
119 for (int i = 0; i < N; i++) // for all N (boxes)
120 {
121 uint32_t maxClass = 0;
122 float maxScore = -1000.0f;
123
124 for (int j = 1; j < C; j++) // ignore background (classId = 0).
125 {
126 const float score = cpu_outputs[0][i * C + j];
127
128 if (score < confThresh)
129 continue;
130
131 if (score > maxScore) {
132 maxScore = score;
133 maxClass = j;
134 }
135 }
136
137 if (maxScore > confThresh) {
138 int left = static_cast<int>(cpu_outputs[1][4 * i] * image_width);
139 int top = static_cast<int>(cpu_outputs[1][4 * i + 1] * image_height);
140 int right = static_cast<int>(cpu_outputs[1][4 * i + 2] * image_width);
141 int bottom = static_cast<int>(cpu_outputs[1][4 * i + 3] * image_height);
142 int width = right - left + 1;
143 int height = bottom - top + 1;
144
145 m_boxes.push_back(cv::Rect(left, top, width, height));
146 m_classIds.push_back(maxClass);
147 m_confidences.push_back(maxScore);
148 }
149 }
150
151 cv::dnn::NMSBoxes(m_boxes, m_confidences, confThresh, nmsThresh, m_indices);
152 m_boxesNMS.resize(m_indices.size());
153 for (size_t i = 0; i < m_indices.size(); ++i) {
154 int idx = m_indices[i];
155 m_boxesNMS[i] = m_boxes[idx];
156 }
157
158 classIds = m_classIds; // Returning detected objects class Ids.
159 return m_boxesNMS;
160}
162
163class Logger : public nvinfer1::ILogger
164{
165public:
166 void log(Severity severity, const char *msg) noexcept // override
167 {
168 if ((severity == Severity::kERROR) || (severity == Severity::kINTERNAL_ERROR) || (severity == Severity::kVERBOSE))
169 std::cout << msg << std::endl;
170 }
171} gLogger;
172
173// destroy TensoRT objects if something goes wrong
174struct TRTDestroy
175{
176 template <class T> void operator()(T *obj) const
177 {
178 if (obj)
179 obj->destroy();
180 }
181};
182
183template <class T> using TRTUniquePtr = std::unique_ptr<T, TRTDestroy>;
184
186bool parseOnnxModel(const std::string &model_path, TRTUniquePtr<nvinfer1::ICudaEngine> &engine,
187 TRTUniquePtr<nvinfer1::IExecutionContext> &context)
189{
190 // this section of code is from jetson-inference's `tensorNet`, to test if the GIE already exists.
191 char cache_prefix[FILENAME_MAX];
192 char cache_path[FILENAME_MAX];
193
194 snprintf(cache_prefix, FILENAME_MAX, "%s", model_path.c_str());
195 snprintf(cache_path, FILENAME_MAX, "%s.engine", cache_prefix);
196
197 std::cout << "attempting to open engine cache file " << cache_path << std::endl;
198
200 if (vpIoTools::checkFilename(cache_path)) {
201 char *engineStream = nullptr;
202 size_t engineSize = 0;
203
204 // determine the file size of the engine
205 struct stat filestat;
206 stat(cache_path, &filestat);
207 engineSize = filestat.st_size;
208
209 // allocate memory to hold the engine
210 engineStream = (char *)malloc(engineSize);
211
212 // open the engine cache file from disk
213 FILE *cacheFile = nullptr;
214 cacheFile = fopen(cache_path, "rb");
215
216 // read the serialized engine into memory
217 const size_t bytesRead = fread(engineStream, 1, engineSize, cacheFile);
218
219 if (bytesRead != engineSize) // Problem while deserializing.
220 {
221 std::cerr << "Error reading serialized engine into memory." << std::endl;
222 return false;
223 }
224
225 // close the plan cache
226 fclose(cacheFile);
227
228 // Recreate the inference runtime
229 TRTUniquePtr<nvinfer1::IRuntime> infer { nvinfer1::createInferRuntime(gLogger) };
230 engine.reset(infer->deserializeCudaEngine(engineStream, engineSize, nullptr));
231 context.reset(engine->createExecutionContext());
232
233 return true;
234 }
236
238 else {
239 if (!vpIoTools::checkFilename(model_path)) {
240 std::cerr << "Could not parse ONNX model. File not found" << std::endl;
241 return false;
242 }
243
244 TRTUniquePtr<nvinfer1::IBuilder> builder { nvinfer1::createInferBuilder(gLogger) };
245 TRTUniquePtr<nvinfer1::INetworkDefinition> network {
246 builder->createNetworkV2(1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH)) };
247 TRTUniquePtr<nvonnxparser::IParser> parser { nvonnxparser::createParser(*network, gLogger) };
248
249 // parse ONNX
250 if (!parser->parseFromFile(model_path.c_str(), static_cast<int>(nvinfer1::ILogger::Severity::kINFO))) {
251 std::cerr << "ERROR: could not parse the model." << std::endl;
252 return false;
253 }
254
255 TRTUniquePtr<nvinfer1::IBuilderConfig> config { builder->createBuilderConfig() };
256 // allow TRT to use up to 1GB of GPU memory for tactic selection
257 config->setMaxWorkspaceSize(32 << 20);
258 // use FP16 mode if possible
259 if (builder->platformHasFastFp16()) {
260 config->setFlag(nvinfer1::BuilderFlag::kFP16);
261 }
262
263 builder->setMaxBatchSize(1);
264
265 engine.reset(builder->buildEngineWithConfig(*network, *config));
266 context.reset(engine->createExecutionContext());
267
268 TRTUniquePtr<nvinfer1::IHostMemory> serMem { engine->serialize() };
269
270 if (!serMem) {
271 std::cout << "Failed to serialize CUDA engine." << std::endl;
272 return false;
273 }
274
275 const char *serData = (char *)serMem->data();
276 const size_t serSize = serMem->size();
277
278 // allocate memory to store the bitstream
279 char *engineMemory = (char *)malloc(serSize);
280
281 if (!engineMemory) {
282 std::cout << "Failed to allocate memory to store CUDA engine." << std::endl;
283 return false;
284 }
285
286 memcpy(engineMemory, serData, serSize);
287
288 // write the cache file
289 FILE *cacheFile = nullptr;
290 cacheFile = fopen(cache_path, "wb");
291
292 fwrite(engineMemory, 1, serSize, cacheFile);
293 fclose(cacheFile);
294
295 return true;
296 }
298}
299
300int main(int argc, char **argv)
301{
302 int opt_device = 0;
303 unsigned int opt_scale = 1;
304 std::string input = "";
305 std::string modelFile = vpIoTools::getViSPImagesDataPath() + "/dnn/object_detection/ssd-mobilenet.onnx";
306 std::string labelFile = vpIoTools::getViSPImagesDataPath() + "/dnn/object_detection/pascal-voc-labels.txt";
307 std::string config = "";
308 float meanR = 127.5f, meanG = 127.5f, meanB = 127.5f;
309 float confThresh = 0.5f;
310 float nmsThresh = 0.4f;
311
312 for (int i = 1; i < argc; i++) {
313 if (std::string(argv[i]) == "--device" && i + 1 < argc) {
314 opt_device = atoi(argv[++i]);
315 }
316 else if (std::string(argv[i]) == "--input" && i + 1 < argc) {
317 input = std::string(argv[++i]);
318 }
319 else if (std::string(argv[i]) == "--model" && i + 1 < argc) {
320 modelFile = std::string(argv[++i]);
321 }
322 else if (std::string(argv[i]) == "--config" && i + 1 < argc) {
323 config = std::string(argv[++i]);
324 }
325 else if (std::string(argv[i]) == "--input-scale" && i + 1 < argc) {
326 opt_scale = atoi(argv[++i]);
327 }
328 else if (std::string(argv[i]) == "--mean" && i + 3 < argc) {
329 meanR = atof(argv[++i]);
330 meanG = atof(argv[++i]);
331 meanB = atof(argv[++i]);
332 }
333 else if (std::string(argv[i]) == "--confThresh" && i + 1 < argc) {
334 confThresh = static_cast<float>(atof(argv[++i]));
335 }
336 else if (std::string(argv[i]) == "--nmsThresh" && i + 1 < argc) {
337 nmsThresh = static_cast<float>(atof(argv[++i]));
338 }
339 else if (std::string(argv[i]) == "--labels" && i + 1 < argc) {
340 labelFile = std::string(argv[++i]);
341 }
342 else if (std::string(argv[i]) == "--help" || std::string(argv[i]) == "-h") {
343 std::cout << argv[0]
344 << " [--device <camera device number>] [--input <path to image or video>"
345 " (camera is used if input is empty)] [--model <path to net trained weights>]"
346 " [--config <path to net config file>]"
347 " [--input-scale <input scale factor>] [--mean <meanR meanG meanB>]"
348 " [--confThresh <confidence threshold>]"
349 " [--nmsThresh <NMS threshold>] [--labels <path to label file>]"
350 << std::endl;
351 return EXIT_SUCCESS;
352 }
353 }
354
355 std::string model_path(modelFile);
356 int batch_size = 1;
357
358 std::vector<std::string> labels;
359 if (!labelFile.empty()) {
360 std::ifstream f_label(labelFile);
361 std::string line;
362 while (std::getline(f_label, line)) {
363 labels.push_back(line);
364 }
365 }
366
368 // Parse the model and initialize the engine and the context.
369 TRTUniquePtr<nvinfer1::ICudaEngine> engine { nullptr };
370 TRTUniquePtr<nvinfer1::IExecutionContext> context { nullptr };
371 if (!parseOnnxModel(model_path, engine, context)) // Problem parsing Onnx model
372 {
373 std::cout << "Make sure the model file exists. To see available models, plese visit: "
374 "\n\twww.github.com/lagadic/visp-images/dnn/object_detection/"
375 << std::endl;
376 return EXIT_FAILURE;
377 }
379
380 std::vector<nvinfer1::Dims> input_dims;
381 std::vector<nvinfer1::Dims> output_dims;
382 std::vector<void *> buffers(engine->getNbBindings()); // buffers for input and output data.
383
385 for (int i = 0; i < engine->getNbBindings(); ++i) {
386 auto binding_size = getSizeByDim(engine->getBindingDimensions(i)) * batch_size * sizeof(float);
387 cudaMalloc(&buffers[i], binding_size);
388
389 if (engine->bindingIsInput(i)) {
390 input_dims.emplace_back(engine->getBindingDimensions(i));
391 }
392 else {
393 output_dims.emplace_back(engine->getBindingDimensions(i));
394 }
395 }
396
397 if (input_dims.empty() || output_dims.empty()) {
398 std::cerr << "Expect at least one input and one output for network" << std::endl;
399 return EXIT_FAILURE;
400 }
402
404 cv::VideoCapture capture;
405
406 if (input.empty()) {
407 capture.open(opt_device);
408 }
409 else {
410 capture.open(input);
411 }
412
413 if (!capture.isOpened()) { // check if we succeeded
414 std::cout << "Failed to open the camera" << std::endl;
415 return EXIT_FAILURE;
416 }
417
418 int cap_width = static_cast<int>(capture.get(cv::CAP_PROP_FRAME_WIDTH));
419 int cap_height = static_cast<int>(capture.get(cv::CAP_PROP_FRAME_HEIGHT));
420 capture.set(cv::CAP_PROP_FRAME_WIDTH, cap_width / opt_scale);
421 capture.set(cv::CAP_PROP_FRAME_HEIGHT, cap_height / opt_scale);
423
425 cv::Mat frame;
426 capture >> frame;
427
428 if (input.empty()) {
429 int i = 0;
430 while ((i++ < 20) && !capture.read(frame)) {
431 } // warm up camera by skiping unread frames
432 }
433
434 vpImageConvert::convert(frame, I);
435 int height = I.getHeight(), width = I.getWidth();
436
437 std::cout << "Image size: " << width << " x " << height << std::endl;
438
439 std::vector<cv::Rect> boxesNMS;
440 std::vector<int> classIds;
441
442#if (VISP_CXX_STANDARD >= VISP_CXX_STANDARD_11)
443 std::shared_ptr<vpDisplay> display = vpDisplayFactory::createDisplay(I);
444#else
446#endif
447
448 double start, stop;
450 while (!vpDisplay::getClick(I, false)) {
451 // get frame.
452 capture >> frame;
453
454 vpImageConvert::convert(frame, I);
455
457 // preprocess
458 preprocessImage(frame, (float *)buffers[0], input_dims[0], meanR, meanG, meanB);
459
460 // inference.
461 context->enqueue(batch_size, buffers.data(), 0, nullptr);
462
463 // post-process
464 boxesNMS = postprocessResults(buffers, output_dims, batch_size, width, height, confThresh, nmsThresh, classIds);
465
466 stop = vpTime::measureTimeMs();
467 // display.
469 vpDisplay::displayText(I, 10, 10, std::to_string(stop - start), vpColor::red);
470
471 for (unsigned int i = 0; i < boxesNMS.size(); i++) {
472 vpDisplay::displayRectangle(I, vpRect(boxesNMS[i].x, boxesNMS[i].y, boxesNMS[i].width, boxesNMS[i].height),
473 vpColor::red, false, 2);
474 vpDisplay::displayText(I, boxesNMS[i].y - 10, boxesNMS[i].x, labels[classIds[i]], vpColor::red);
475 }
476
478 }
480
481 for (void *buf : buffers)
482 cudaFree(buf);
483
484#if (VISP_CXX_STANDARD < VISP_CXX_STANDARD_11)
485 if (display != nullptr) {
486 delete display;
487 }
488#endif
489
490 return EXIT_SUCCESS;
491}
492#else
493int main()
494{
495 std::cout << "OpenCV is not built with CUDA." << std::endl;
496
497 return EXIT_SUCCESS;
498}
499#endif
500
501#else
502int main()
503{
504 std::cout << "ViSP is not built with TensorRT." << std::endl;
505
506 return EXIT_SUCCESS;
507}
508#endif
static const vpColor red
Definition vpColor.h:198
Class that defines generic functionalities for display.
Definition vpDisplay.h:171
static bool getClick(const vpImage< unsigned char > &I, bool blocking=true)
static void display(const vpImage< unsigned char > &I)
static void flush(const vpImage< unsigned char > &I)
static void displayRectangle(const vpImage< unsigned char > &I, const vpImagePoint &topLeft, unsigned int width, unsigned int height, const vpColor &color, bool fill=false, unsigned int thickness=1)
static void displayText(const vpImage< unsigned char > &I, const vpImagePoint &ip, const std::string &s, const vpColor &color)
static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
Definition of the vpImage class member functions.
Definition vpImage.h:131
static std::string getViSPImagesDataPath()
static bool checkFilename(const std::string &filename)
Defines a rectangle in the plane.
Definition vpRect.h:79
std::shared_ptr< vpDisplay > createDisplay()
Return a smart pointer vpDisplay specialization if a GUI library is available or nullptr otherwise.
vpDisplay * allocateDisplay()
Return a newly allocated vpDisplay specialization if a GUI library is available or nullptr otherwise.
VISP_EXPORT double measureTimeMs()