34#include <visp3/core/vpConfig.h>
37#if defined(VISP_HAVE_OPENCV) && (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && \
38 ((__cplusplus >= 201703L) || (defined(_MSVC_LANG) && (_MSVC_LANG >= 201703L)))
40#include <visp3/core/vpImageConvert.h>
41#include <visp3/detection/vpDetectorDNNOpenCV.h>
42#include <visp3/core/vpIoTools.h>
54 std::string list =
"[";
100 name =
"ssd-mobilenet";
106 name =
"user-specified";
126 bool hasFoundMatch =
false;
128 for (
int id = 0;
id <
COUNT && !hasFoundMatch;
id++) {
132 hasFoundMatch =
true;
150 return NetConfig::parseClassNamesFile(filename);
179#ifdef VISP_HAVE_NLOHMANN_JSON
181using json = nlohmann::json;
205 std::ifstream file(jsonPath);
207 std::stringstream ss;
208 ss <<
"Problem opening file " << jsonPath <<
". Make sure it exists and is readable" << std::endl;
213 j = json::parse(file);
215 catch (json::parse_error &e) {
216 std::stringstream msg;
217 msg <<
"Could not parse JSON file : \n";
219 msg << e.what() << std::endl;
220 msg <<
"Byte position of error: " << e.byte;
235 std::ofstream file(jsonPath);
236 const json j = *
this;
362 catch (
const cv::Exception &e) {
363 std::cerr <<
"Caught an exception trying to run inference:" << std::endl <<
"\t"
365 <<
"\nCuda and/or GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
366 m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
367 m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
373 size_t nbClassNames =
m_netConfig.m_classNames.size();
374 for (
size_t i = 0; i <
m_indices.size(); ++i) {
376 cv::Rect box = proposals.
m_boxes[idx];
377 std::optional<std::string> classname_opt;
378 if (nbClassNames > 0) {
381 output.emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
392 return !output.empty();
415 catch (
const cv::Exception &e) {
416 std::cerr <<
"Caught an exception trying to run inference:" << std::endl <<
"\t"
418 <<
"\nCuda and/or GPU driver might not be correctly installed. Setting preferable backend to CPU and trying again." << std::endl;
419 m_net.setPreferableBackend(cv::dnn::DNN_BACKEND_DEFAULT);
420 m_net.setPreferableTarget(cv::dnn::DNN_TARGET_CPU);
426 size_t nbClassNames =
m_netConfig.m_classNames.size();
427 for (
size_t i = 0; i <
m_indices.size(); ++i) {
429 cv::Rect box = proposals.
m_boxes[idx];
430 std::string classname;
431 if (nbClassNames > 0) {
435 classname = std::to_string(proposals.
m_classIds[idx]);
437 std::optional<std::string> classname_opt = std::optional<std::string>(classname);
438 output[classname].emplace_back(box.x, box.x + box.width, box.y, box.y + box.height
448 return !output.empty();
460 std::map< std::string, std::vector<DetectedFeatures2D>> map_output;
461 bool returnStatus =
detect(I, map_output);
462 for (
auto key_val : map_output) {
463 output.push_back(key_val);
468#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
476 static std::vector<cv::String> names;
478 std::vector<int> outLayers =
m_net.getUnconnectedOutLayers();
479 std::vector<cv::String> layersNames =
m_net.getLayerNames();
480 names.resize(outLayers.size());
481 for (
size_t i = 0; i < outLayers.size(); ++i)
482 names[i] = layersNames[outLayers[i] - 1];
516#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
549std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
553 double originalNumberOfObj =
static_cast<double>(detected_features.size());
554 double meanFactor = 1. / originalNumberOfObj;
558 meanArea += feature.m_bbox.getArea();
560 meanArea *= meanFactor;
563 std::vector<DetectedFeatures2D> filtered_features;
565 if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
566 filtered_features.push_back(feature);
570 return filtered_features;
585std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>
588#ifndef DOXYGEN_SHOULD_SKIP_THIS
593 class MeanAreaComputer
596 std::map<int, std::pair<int, double>> m_map_id_pairOccurrencesAreas;
599 std::map<int, double> m_mapMeans;
606 double computeMeanArea(
const int &class_id)
608 return m_map_id_pairOccurrencesAreas[class_id].second /
static_cast<double>(m_map_id_pairOccurrencesAreas[class_id].first);
617 for (
const auto &classID_pair : m_map_id_pairOccurrencesAreas) {
618 m_mapMeans[classID_pair.first] = computeMeanArea(classID_pair.first);
622 double getMean(
const int &class_id)
624 if (m_map_id_pairOccurrencesAreas.find(class_id) == m_map_id_pairOccurrencesAreas.end()) {
625 throw(
vpException(
vpException::badValue,
"[MeanAreaComputer::getMean] Asking for class_id \"" + std::to_string(class_id) +
"\" that is not present in m_mapMeans. Did you call computeMeans ?"));
627 return m_mapMeans[class_id];
639 if (m_map_id_pairOccurrencesAreas.find(class_id) == m_map_id_pairOccurrencesAreas.end()) {
640 m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(1, area);
643 std::pair<int, double> prev_state = m_map_id_pairOccurrencesAreas[class_id];
644 m_map_id_pairOccurrencesAreas[class_id] = std::pair<int, double>(prev_state.first + 1, prev_state.second + area);
651 MeanAreaComputer meanComputer;
652 std::for_each(detected_features.begin(), detected_features.end(), meanComputer);
653 meanComputer.computeMeans();
656 std::vector<DetectedFeatures2D> filtered_features;
658 double meanArea = meanComputer.getMean(feature.getClassId());
659 if (feature.m_bbox.getArea() >= minRatioOfAreaOk * meanArea
660 && feature.m_bbox.getArea() < meanArea / minRatioOfAreaOk) {
661 filtered_features.push_back(feature);
665 return filtered_features;
677std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>>
680 std::map<std::string, std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D>> output;
681 for (
auto keyval : detected_features) {
702 size_t nbBatches = dnnRes.size();
704 for (
size_t i = 0; i < nbBatches; i++) {
707 int num_proposal = dnnRes[i].size[0];
708 int nout = dnnRes[i].size[1];
709 if (dnnRes[i].dims > 2) {
710 num_proposal = dnnRes[i].size[1];
711 nout = dnnRes[i].size[2];
712 dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
715 int n = 0, row_ind = 0;
716 float *pdata = (
float *)dnnRes[i].data;
719 for (n = 0; n < num_proposal; n++) {
720 float box_score = pdata[4];
721 if (box_score > netConfig.m_confThreshold) {
722 cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout);
723 cv::Point classIdPoint;
724 double max_class_score;
726 cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);
728 max_class_score *= box_score;
731 if (max_class_score > netConfig.m_confThreshold) {
732 const int class_idx = classIdPoint.x;
733 float cx = pdata[0] *
m_img.cols;
734 float cy = pdata[1] *
m_img.rows;
735 float w = pdata[2] *
m_img.cols;
736 float h = pdata[3] *
m_img.rows;
738 int left = int(cx - 0.5 * w);
739 int top = int(cy - 0.5 * h);
741 proposals.
m_confidences.push_back(
static_cast<float>(max_class_score));
742 proposals.
m_boxes.push_back(cv::Rect(left, top,
static_cast<int>(w),
static_cast<int>(h)));
767 float ratioh =
static_cast<float>(
m_img.rows) / netConfig.m_inputSize.height, ratiow =
static_cast<float>(
m_img.cols) / netConfig.m_inputSize.width;
768 size_t nbBatches = dnnRes.size();
770 for (
size_t i = 0; i < nbBatches; i++) {
772 int num_proposal = dnnRes[i].size[0];
773 int nout = dnnRes[i].size[1];
774 if (dnnRes[i].dims > 2) {
775 num_proposal = dnnRes[i].size[1];
776 nout = dnnRes[i].size[2];
777 dnnRes[i] = dnnRes[i].reshape(0, num_proposal);
780 int n = 0, row_ind = 0;
781 float *pdata = (
float *)dnnRes[i].data;
784 for (n = 0; n < num_proposal; n++) {
785 float box_score = pdata[4];
787 if (box_score > netConfig.m_confThreshold) {
788 cv::Mat scores = dnnRes[i].row(row_ind).colRange(5, nout);
789 cv::Point classIdPoint;
790 double max_class_score;
792 cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);
793 max_class_score *= box_score;
796 if (max_class_score > netConfig.m_confThreshold) {
797 const int class_idx = classIdPoint.x;
798 float cx = pdata[0] * ratiow;
799 float cy = pdata[1] * ratioh;
800 float w = pdata[2] * ratiow;
801 float h = pdata[3] * ratioh;
803 int left = int(cx - 0.5 * w);
804 int top = int(cy - 0.5 * h);
806 proposals.
m_confidences.push_back(
static_cast<float>(max_class_score));
807 proposals.
m_boxes.push_back(cv::Rect(left, top,
static_cast<int>(w),
static_cast<int>(h)));
833 float ratioh =
static_cast<float>(
m_img.rows) / netConfig.m_inputSize.height, ratiow =
static_cast<float>(
m_img.cols) / netConfig.m_inputSize.width;
834 size_t nbBatches = dnnRes.size();
836 for (
size_t i = 0; i < nbBatches; i++) {
838 int num_proposal = dnnRes[i].size[1];
839 int nout = dnnRes[i].size[0];
840 if (dnnRes[i].dims > 2) {
841 num_proposal = dnnRes[i].size[2];
842 nout = dnnRes[i].size[1];
843 dnnRes[i] = dnnRes[i].reshape(0, nout);
845 cv::transpose(dnnRes[i], dnnRes[i]);
847 int n = 0, row_ind = 0;
848 float *pdata = (
float *)dnnRes[i].data;
851 for (n = 0; n < num_proposal; n++) {
852 cv::Mat scores = dnnRes[i].row(row_ind).colRange(4, nout);
853 cv::Point classIdPoint;
854 double max_class_score;
856 cv::minMaxLoc(scores, 0, &max_class_score, 0, &classIdPoint);
859 if (max_class_score > netConfig.m_confThreshold) {
860 const int class_idx = classIdPoint.x;
861 float cx = pdata[0] * ratiow;
862 float cy = pdata[1] * ratioh;
863 float w = pdata[2] * ratiow;
864 float h = pdata[3] * ratioh;
866 int left = int(cx - 0.5 * w);
867 int top = int(cy - 0.5 * h);
869 proposals.
m_confidences.push_back(
static_cast<float>(max_class_score));
870 proposals.
m_boxes.push_back(cv::Rect(left, top,
static_cast<int>(w),
static_cast<int>(h)));
899 size_t nbBatches = dnnRes.size();
900 for (
size_t j = 0; j < nbBatches; j++) {
901 float *data = (
float *)dnnRes[j].data;
902 for (
size_t i = 0; i < dnnRes[j].total(); i += 7) {
903 float confidence = data[i + 2];
904 if (confidence > netConfig.m_confThreshold) {
905 int left =
static_cast<int>(data[i + 3] *
m_img.cols);
906 int top =
static_cast<int>(data[i + 4] *
m_img.rows);
907 int right =
static_cast<int>(data[i + 5] *
m_img.cols);
908 int bottom =
static_cast<int>(data[i + 6] *
m_img.rows);
909 int classId =
static_cast<int>(data[i + 1]);
911 proposals.
m_confidences.push_back(
static_cast<float>(confidence));
912 proposals.
m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
920#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
940 int scores_index =
m_outNames[0] ==
"scores" ? 0 : 1;
941 int boxes_index =
m_outNames[0] ==
"boxes" ? 0 : 1;
943 int N = dnnRes[scores_index].size[1], C = dnnRes[scores_index].size[2];
945 float *confidence = (
float *)dnnRes[scores_index].data;
946 float *bbox = (
float *)dnnRes[boxes_index].data;
949 for (
int i = 0; i < N; i++) {
950 uint32_t maxClass = 0;
951 float maxScore = -1000.0f;
953 for (
int j = 1; j < C; j++)
955 const float score = confidence[i * C + j];
957 if (score < netConfig.m_confThreshold)
960 if (score > maxScore) {
966 if (maxScore > netConfig.m_confThreshold) {
967 int left =
static_cast<int>(bbox[4 * i] *
m_img.cols);
968 int top =
static_cast<int>(bbox[4 * i + 1] *
m_img.rows);
969 int right =
static_cast<int>(bbox[4 * i + 2] *
m_img.cols);
970 int bottom =
static_cast<int>(bbox[4 * i + 3] *
m_img.rows);
971 int width = right - left + 1;
972 int height = bottom - top + 1;
974 int classId = maxClass;
976 proposals.
m_boxes.push_back(cv::Rect(left, top, width, height));
1001 CV_Assert(dnnRes.size() == 1);
1002 float *data = (
float *)dnnRes[0].data;
1003 for (
size_t i = 0; i < dnnRes[0].total(); i += 7) {
1004 float confidence = data[i + 2];
1005 if (confidence > netConfig.m_confThreshold) {
1006 int left =
static_cast<int>(data[i + 3] *
m_img.cols);
1007 int top =
static_cast<int>(data[i + 4] *
m_img.rows);
1008 int right =
static_cast<int>(data[i + 5] *
m_img.cols);
1009 int bottom =
static_cast<int>(data[i + 6] *
m_img.rows);
1010 int classId =
static_cast<int>(data[i + 1]) - 1;
1012 proposals.
m_confidences.push_back(
static_cast<float>(confidence));
1013 proposals.
m_boxes.push_back(cv::Rect(left, top, right - left + 1, bottom - top + 1));
1059 m_net = cv::dnn::readNet(model, config, framework);
1060#if (VISP_HAVE_OPENCV_VERSION == 0x030403)
1108 if (
m_netConfig.m_filterSizeRatio > std::numeric_limits<double>::epsilon()) {
1160 std::cout <<
"[vpDetectorDNNOpenCV::setParsingMethod] WARNING: scale factor should be 1/255. to normalize pixels value." << std::endl;
1180 m_netConfig.m_parsingMethodType = typeParsingMethod;
1184 std::cout <<
"[vpDetectorDNNOpenCV::setParsingMethod] NB: scale factor changed to 1/255. to normalize pixels value." << std::endl;
1187#if defined(VISP_BUILD_DEPRECATED_FUNCTIONS)
1189 std::cout <<
"[vpDetectorDNNOpenCV::setParsingMethod] WARNING: The chosen type of network is " <<
dnnResultsParsingTypeToString(
m_netConfig.m_parsingMethodType) <<
" VISP_BUILD_DEPRECATED_FUNCTIONS is set to true." << std::endl;
1190 std::cout <<
"\tThe parsing method that worked with the networks quoted in the ViSP documentation was postProcess_ResNet_10 instead of postProcess_SSD_MobileNet." << std::endl;
1191 std::cout <<
"\tIf the SSD-MobileNet network does not seem to work, please try to recompile ViSP setting VISP_BUILD_DEPRECATED_FUNCTIONS as false." << std::endl << std::flush;
1197#elif !defined(VISP_BUILD_SHARED_LIBS)
1199void dummy_vpDetectorDNN() { }
Structure containing the bounding box, expressed in pixels, confidence and class information about an...
vpRect getBoundingBox() const
unsigned int getClassId() const
Structure containing some information required for the configuration of a vpDetectorDNNOpenCV object.
cv::Mat m_blob
Buffer for the blob in input net.
void postProcess_YoloV5_V7(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
void setScaleFactor(const double &scaleFactor)
void initFromJSON(const std::string &jsonPath)
Initialize detector from a json config file.
void readNet(const std::string &model, const std::string &config="", const std::string &framework="")
static void postProcess_unimplemented(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
void setDetectionFilterSizeRatio(const double &sizeRatio)
DNNResultsParsingType
Enumeration listing the types of DNN for which the vpDetectorDNNOpenCV furnishes the methods permitti...
static DNNResultsParsingType dnnResultsParsingTypeFromString(const std::string &name)
void postProcess_SSD_MobileNet(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
std::vector< cv::String > m_outNames
Names of layers with unconnected outputs.
void setMean(const double &meanR, const double &meanG, const double &meanB)
void setSwapRB(const bool &swapRB)
cv::Mat m_img
Buffer for the input image.
static std::vector< std::string > parseClassNamesFile(const std::string &filename)
Parse the designated file that contains the list of the classes the network can detect....
std::vector< int > m_indices
Indices for NMS.
void setParsingMethod(const DNNResultsParsingType &typeParsingMethod, void(*parsingMethod)(DetectionCandidates &, std::vector< cv::Mat > &, const NetConfig &)=postProcess_unimplemented)
NetConfig m_netConfig
Configuration of the DNN.
std::vector< cv::Mat > m_dnnRes
Contains all output blobs for each layer specified in m_outNames.
cv::dnn::Net m_net
DNN network.
bool m_applySizeFilterAfterNMS
If true, filter the detections removing the ones for which the bbox does not respect area(bbox) € [me...
std::vector< cv::String > getOutputsNames()
Get the names of the output layers of the DNN.
void setNetConfig(const NetConfig &config)
void postProcess_YoloV3_V4(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
virtual bool detect(const vpImage< unsigned char > &I, std::vector< DetectedFeatures2D > &output)
Object detection using OpenCV DNN module.
void postProcess_ResNet_10(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
void setPreferableBackend(const int &backendId)
void setNMSThreshold(const float &nmsThreshold)
virtual ~vpDetectorDNNOpenCV()
Destroy the vpDetectorDNNOpenCV object.
void postProcess_FasterRCNN(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
std::vector< DetectedFeatures2D > filterDetectionMultiClassInput(const std::vector< DetectedFeatures2D > &detected_features, const double minRatioOfAreaOk)
Return a new vector, ordered by vpDetectorDNNOpenCV::DetectedFeatures2D::m_cls , where the area of ea...
void postProcess_YoloV8_V11_V12(DetectionCandidates &proposals, std::vector< cv::Mat > &dnnRes, const NetConfig &netConfig)
void setPreferableTarget(const int &targetId)
void setInputSize(const int &width, const int &height)
static std::string dnnResultsParsingTypeToString(const DNNResultsParsingType &type)
void postProcess(DetectionCandidates &proposals)
static std::string getAvailableDnnResultsParsingTypes()
Get the list of the parsing methods / types of DNNs supported by the vpDetectorDNNOpenCV class.
void(* m_parsingMethod)(DetectionCandidates &, std::vector< cv::Mat > &, const NetConfig &)
Pointer towards the parsing method, used if m_parsingMethodType is equal to m_parsingMethodType::USER...
std::vector< DetectedFeatures2D > filterDetectionSingleClassInput(const std::vector< DetectedFeatures2D > &detected_features, const double minRatioOfAreaOk)
Return a new vector of detected features whose area is greater or equal to the average area x minRati...
void saveConfigurationInJSON(const std::string &jsonPath) const
Save the network configuration in a JSON file.
void setConfidenceThreshold(const float &confThreshold)
vpImage< vpRGBa > m_I_color
Buffer for gray to RGBa image conversion.
error that can be emitted by ViSP classes.
@ badValue
Used to indicate that a value is not in the allowed range.
@ functionNotImplementedError
Function not implemented.
static void convert(const vpImage< unsigned char > &src, vpImage< vpRGBa > &dest)
Definition of the vpImage class member functions.
std::vector< int > m_classIds
std::vector< float > m_confidences
std::vector< cv::Rect > m_boxes