Visual Servoing Platform version 3.6.0
tutorial-megapose-live-single-object-tracking.cpp
#include <iostream>

#include <visp3/core/vpConfig.h>
#if (VISP_CXX_STANDARD >= VISP_CXX_STANDARD_17) && defined(VISP_HAVE_NLOHMANN_JSON) && defined(VISP_HAVE_OPENCV) && defined(HAVE_OPENCV_VIDEOIO) && defined(HAVE_OPENCV_DNN) && \
  (defined(VISP_HAVE_X11) || defined(VISP_HAVE_GDI) || defined(HAVE_OPENCV_HIGHGUI))

#include <optional>

#include <visp3/core/vpIoTools.h>
#include <visp3/detection/vpDetectorDNNOpenCV.h>
#include <visp3/gui/vpDisplayGDI.h>
#include <visp3/gui/vpDisplayOpenCV.h>
#include <visp3/gui/vpDisplayX.h>
#include <visp3/dnn_tracker/vpMegaPose.h>
#include <visp3/dnn_tracker/vpMegaPoseTracker.h>
#include <visp3/io/vpJsonArgumentParser.h>

#include <nlohmann/json.hpp>

#include <opencv2/videoio.hpp>


using json = nlohmann::json;

/*
 * Interpolate between two vpColors. Linear interpolation of each component (R, G, B)
 *
 * low starting color
 * high ending color
 * f interpolation factor, between 0 and 1
 * Returns the interpolated color
 */
vpColor interpolate(const vpColor &low, const vpColor &high, const float f)
{
  const float r = (float)low.R + ((float)high.R - (float)low.R) * f;
  const float g = (float)low.G + ((float)high.G - (float)low.G) * f;
  const float b = (float)low.B + ((float)high.B - (float)low.B) * f;
  return vpColor((unsigned char)r, (unsigned char)g, (unsigned char)b);
}

/*
 * Display the Megapose confidence score as a rectangle in the image.
 * This rectangle becomes green when Megapose is "confident" about its prediction.
 * The confidence score measures whether Megapose can, from its current estimation, recover the true pose in future refinement iterations.
 *
 * \param[in] I : The image in which to display the confidence.
 * \param[in] score : The confidence score of Megapose, between 0 and 1.
 */
void displayScore(const vpImage<vpRGBa> &I, float score)
{
  const unsigned top = static_cast<unsigned>(I.getHeight() * 0.85f);
  const unsigned height = static_cast<unsigned>(I.getHeight() * 0.1f);
  const unsigned left = static_cast<unsigned>(I.getWidth() * 0.05f);
  const unsigned width = static_cast<unsigned>(I.getWidth() * 0.5f);
  vpRect full(left, top, width, height);
  vpRect scoreRect(left, top, width * score, height);
  const vpColor low = vpColor::red;
  const vpColor high = vpColor::green;
  const vpColor c = interpolate(low, high, score);

  vpDisplay::displayRectangle(I, full, c, false, 5);
  vpDisplay::displayRectangle(I, scoreRect, c, true, 1);
}
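
/*
 * For example, with a 640 x 480 image the gauge occupies a 320 x 48 pixel area whose top-left corner
 * is at x = 640 * 0.05 = 32, y = 480 * 0.85 = 408. A score of 0.5 fills the left half of the outline,
 * drawn in the color halfway between red and green.
 */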

/*
 * Add the Megapose rendering on top of the actual image I.
 * Requires I and overlay to be the same size.
 * Note that a fully black object will not be rendered, since black overlay pixels are treated as background.
 */
void overlayRender(vpImage<vpRGBa> &I, const vpImage<vpRGBa> &overlay)
{
  const vpRGBa black = vpRGBa(0, 0, 0);
  for (unsigned int i = 0; i < I.getHeight(); ++i) {
    for (unsigned int j = 0; j < I.getWidth(); ++j) {
      if (overlay[i][j] != black) {
        I[i][j] = overlay[i][j];
      }
    }
  }
}

/*
 * Run the detection network on an image in order to find a specific object.
 * The best matching detection is returned:
 * - If a previous Megapose estimation is available, find the closest match in the image (Euclidean distance between centers)
 * - Otherwise, take the detection with the highest confidence
 * If no detection corresponding to detectionLabel is found, then std::nullopt is returned
 */
std::optional<vpRect> detectObjectForInitMegaposeDnn(vpDetectorDNNOpenCV &detector, const cv::Mat &I,
                                                     const std::string &detectionLabel,
                                                     std::optional<vpMegaPoseEstimate> previousEstimate)
{
  std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> detections_vec;
  detector.detect(I, detections_vec);
  std::vector<vpDetectorDNNOpenCV::DetectedFeatures2D> matchingDetections;
  for (const auto &detection : detections_vec) {
    std::optional<std::string> classnameOpt = detection.getClassName();
    if (classnameOpt) {
      if (*classnameOpt == detectionLabel) {
        matchingDetections.push_back(detection);
      }
    }
  }
  if (matchingDetections.size() == 0) {
    return std::nullopt;
  }
  else if (matchingDetections.size() == 1) {
    return matchingDetections[0].getBoundingBox();
  }
  else {
    // Get detection that is closest to previous object bounding box estimated by Megapose
    if (previousEstimate) {
      vpRect best;
      double bestDist = 10000.0;
      const vpImagePoint previousCenter = (*previousEstimate).boundingBox.getCenter();
      for (const auto &detection : matchingDetections) {
        const vpRect detectionBB = detection.getBoundingBox();
        const vpImagePoint center = detectionBB.getCenter();
        const double matchDist = vpImagePoint::distance(center, previousCenter);
        if (matchDist < bestDist) {
          bestDist = matchDist;
          best = detectionBB;
        }
      }
      return best;
    }
    else { // Get detection with highest confidence
      vpRect best;
      double highestConf = 0.0;
      for (const auto &detection : matchingDetections) {
        const double conf = detection.getConfidenceScore();
        if (conf > highestConf) {
          highestConf = conf;
          best = detection.getBoundingBox();
        }
      }
      return best;
    }
  }
  return std::nullopt;
}

/*
 * Ask the user to provide the detection themselves. They must click to start labelling, then click on the top left and bottom right corners of the bounding box.
 */
std::optional<vpRect> detectObjectForInitMegaposeClick(const vpImage<vpRGBa> &I)
{
  const bool startLabelling = vpDisplay::getClick(I, false);

  const vpImagePoint textPosition(10.0, 20.0);

  if (startLabelling) {
    vpImagePoint topLeft, bottomRight;
    vpDisplay::displayText(I, textPosition, "Click the upper left corner of the bounding box", vpColor::red);
    vpDisplay::flush(I);
    vpDisplay::getClick(I, topLeft, true);
    vpDisplay::display(I);
    vpDisplay::displayCross(I, topLeft, 5, vpColor::red, 2);
    vpDisplay::displayText(I, textPosition, "Click the bottom right corner of the bounding box", vpColor::red);
    vpDisplay::flush(I);
    vpDisplay::getClick(I, bottomRight, true);
    vpRect bb(topLeft, bottomRight);
    return bb;
  }
  else {
    vpDisplay::display(I);
    vpDisplay::displayText(I, textPosition, "Click when the object is visible and static to start reinitializing megapose.", vpColor::red);
    vpDisplay::flush(I);
    return std::nullopt;
  }
}

enum DetectionMethod
{
  UNKNOWN,
  CLICK,
  DNN
};

NLOHMANN_JSON_SERIALIZE_ENUM(DetectionMethod, {
  {UNKNOWN, nullptr}, // Default value if the json string is neither "click" nor "dnn"
  {CLICK, "click"},
  {DNN, "dnn"} }
);
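
/*
 * With the mapping above, the "detectionMethod" entry of the JSON configuration is converted as follows:
 *   "click" -> DetectionMethod::CLICK
 *   "dnn"   -> DetectionMethod::DNN
 * Any other string (or a null value) falls back to DetectionMethod::UNKNOWN, which is rejected after
 * argument parsing in main().
 */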

int main(int argc, const char *argv[])
{
  unsigned width = 640, height = 480;
  // Default camera intrinsics: placeholder values, meant to be overridden by the "camera" argument of the JSON configuration
  vpCameraParameters cam(600, 600, width / 2.0, height / 2.0);
  std::string videoDevice = "0";
  std::string megaposeAddress = "127.0.0.1";
  unsigned megaposePort = 5555;
  int refinerIterations = 1, coarseNumSamples = 576;
  double reinitThreshold = 0.2;

  DetectionMethod detectionMethod = DetectionMethod::UNKNOWN;

  std::string detectorModelPath = "path/to/model.onnx", detectorConfig = "none";
  std::string detectorFramework = "onnx", detectorTypeString = "yolov7";
  std::string objectName = "cube";
  std::vector<std::string> labels = { "cube" };
  float detectorMeanR = 0.f, detectorMeanG = 0.f, detectorMeanB = 0.f;
  float detectorConfidenceThreshold = 0.65f, detectorNmsThreshold = 0.5f, detectorFilterThreshold = -0.25f;
  float detectorScaleFactor = 0.0039f;
  bool detectorSwapRB = false;

  vpJsonArgumentParser parser("Single object tracking with Megapose", "--config", "/");
  parser.addArgument("width", width, true, "The image width")
    .addArgument("height", height, true, "The image height")
    .addArgument("camera", cam, true, "The camera intrinsic parameters. Should correspond to a perspective projection model without distortion.")
    .addArgument("video-device", videoDevice, true, "Video device")
    .addArgument("object", objectName, true, "Name of the object to track with megapose.")
    .addArgument("detectionMethod", detectionMethod, true, "How to perform detection of the object to get the bounding box:"
                 " \"click\" for user labelling, \"dnn\" for dnn detection.")
    .addArgument("reinitThreshold", reinitThreshold, false, "If the Megapose score falls below this threshold, a reinitialization is required."
                 " Should be between 0 and 1")
    .addArgument("megapose/address", megaposeAddress, true, "IP address of the Megapose server.")
    .addArgument("megapose/port", megaposePort, true, "Port on which the Megapose server listens for connections.")
    .addArgument("megapose/refinerIterations", refinerIterations, false, "Number of Megapose refiner model iterations. "
                 "A higher count may lead to better accuracy, at the cost of more processing time")
    .addArgument("megapose/initialisationNumSamples", coarseNumSamples, false, "Number of Megapose renderings used for the initial pose estimation.")

    .addArgument("detector/model-path", detectorModelPath, true, "Path to the model")
    .addArgument("detector/config", detectorConfig, true, "Path to the model configuration. Set to none if config is not required.")
    .addArgument("detector/framework", detectorFramework, true, "Detector framework")
    .addArgument("detector/type", detectorTypeString, true, "Detector type")
    .addArgument("detector/labels", labels, true, "Detection class labels")
    .addArgument("detector/mean/red", detectorMeanR, false, "Detector mean red component. Used to normalize image")
    .addArgument("detector/mean/green", detectorMeanG, false, "Detector mean green component. Used to normalize image")
    .addArgument("detector/mean/blue", detectorMeanB, false, "Detector mean blue component. Used to normalize image")
    .addArgument("detector/confidenceThreshold", detectorConfidenceThreshold, false, "Detector confidence threshold. "
                 "Detections with a confidence below this threshold are ignored")
    .addArgument("detector/nmsThreshold", detectorNmsThreshold, false, "Detector non maximal suppression threshold.")
    .addArgument("detector/filterThreshold", detectorFilterThreshold, false)
    .addArgument("detector/scaleFactor", detectorScaleFactor, false, "Pixel intensity rescaling factor. If set to 1/255, then pixel values are between 0 and 1.")
    .addArgument("detector/swapRedAndBlue", detectorSwapRB, false, "Whether to swap red and blue channels before feeding the image to the detector.");

  parser.parse(argc, argv);
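
  /*
   * As an illustration, a JSON file passed with --config could look like the following. The values simply
   * mirror the defaults above; the exact layout of the "camera" entry follows the JSON serialization of
   * vpCameraParameters and may differ from what is shown here:
   *
   * {
   *   "width": 640,
   *   "height": 480,
   *   "video-device": "0",
   *   "object": "cube",
   *   "detectionMethod": "click",
   *   "reinitThreshold": 0.2,
   *   "camera": { "px": 600.0, "py": 600.0, "u0": 320.0, "v0": 240.0, "model": "perspectiveWithoutDistortion" },
   *   "megapose": { "address": "127.0.0.1", "port": 5555 },
   *   "detector": {
   *     "model-path": "path/to/model.onnx",
   *     "config": "none",
   *     "framework": "onnx",
   *     "type": "yolov7",
   *     "labels": [ "cube" ]
   *   }
   * }
   */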

  if (cam.get_projModel() != vpCameraParameters::perspectiveProjWithoutDistortion) {
    throw vpException(vpException::badValue, "The camera projection model should be without distortion, as other models are ignored by Megapose");
  }

  if (detectionMethod == DetectionMethod::UNKNOWN) {
    throw vpException(vpException::badValue, "The specified detection method is incorrect: it should be either \"click\" or \"dnn\"");
  }

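  // Open the video source: a numeric string (e.g. "0") selects a live camera through OpenCV, while any
  // other string is treated as the path to a prerecorded video whose framerate is used to throttle the main loop.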
  cv::VideoCapture capture;
  bool isLiveCapture;
  bool hasCaptureOpeningSucceeded;
  double videoFrametime = 0; // Only for prerecorded videos
  if (vpMath::isNumber(videoDevice)) {
    hasCaptureOpeningSucceeded = capture.open(std::atoi(videoDevice.c_str()));
    isLiveCapture = true;
  }
  else {
    hasCaptureOpeningSucceeded = capture.open(videoDevice);
    isLiveCapture = false;
    double fps = capture.get(cv::CAP_PROP_FPS);
    videoFrametime = (1.0 / fps) * 1000.0;
  }
  if (!hasCaptureOpeningSucceeded) {
    std::cout << "Capture from camera: " << videoDevice << " didn't work" << std::endl;
    return EXIT_FAILURE;
  }

#if defined(VISP_HAVE_X11)
  vpDisplayX d;
#elif defined(VISP_HAVE_GDI)
  vpDisplayGDI d;
#elif defined(HAVE_OPENCV_HIGHGUI)
  vpDisplayOpenCV d;
#endif
  //d.setDownScalingFactor(vpDisplay::SCALE_AUTO);
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && (VISP_CXX_STANDARD >= VISP_CXX_STANDARD_17)
  vpDetectorDNNOpenCV::DNNResultsParsingType detectorType =
    vpDetectorDNNOpenCV::dnnResultsParsingTypeFromString(detectorTypeString);
  vpDetectorDNNOpenCV::NetConfig netConfig(detectorConfidenceThreshold, detectorNmsThreshold, labels,
                                           cv::Size(width, height), detectorFilterThreshold);
  vpDetectorDNNOpenCV dnn(netConfig, detectorType);
  if (detectionMethod == DetectionMethod::DNN) {
    dnn.readNet(detectorModelPath, detectorConfig, detectorFramework);
    dnn.setMean(detectorMeanR, detectorMeanG, detectorMeanB);
    dnn.setScaleFactor(detectorScaleFactor);
    dnn.setSwapRB(detectorSwapRB);
  }
#endif

  std::shared_ptr<vpMegaPose> megapose;
  try {
    megapose = std::make_shared<vpMegaPose>(megaposeAddress, megaposePort, cam, height, width);
  }
  catch (...) {
    throw vpException(vpException::ioError, "Could not connect to Megapose server at " + megaposeAddress + " on port " + std::to_string(megaposePort));
  }

  vpMegaPoseTracker megaposeTracker(megapose, objectName, refinerIterations);
  megapose->setCoarseNumSamples(coarseNumSamples);
  const std::vector<std::string> allObjects = megapose->getObjectNames();
  if (std::find(allObjects.begin(), allObjects.end(), objectName) == allObjects.end()) {
    throw vpException(vpException::badValue, "Object " + objectName + " is not known by the Megapose server!");
  }
  std::future<vpMegaPoseEstimate> trackerFuture;

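  // The Megapose tracker runs asynchronously: init() and track() return a std::future immediately, and
  // the loop below polls it with wait_for() and collects the pose with get() once it is ready, so that
  // image acquisition and display are never blocked by the server computation.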
  cv::Mat frame;
  vpImage<vpRGBa> I; // ViSP image used for display, converted from the OpenCV frame
  vpMegaPoseEstimate megaposeEstimate; // Last Megapose estimation
  vpRect lastDetection; // Last detection (initialization)
  bool callMegapose = true; // Whether we should call Megapose this iteration
  bool initialized = false; // Whether tracking should be initialized or reinitialized
  bool tracking = false;

  bool overlayModel = true;
  vpImage<vpRGBa> overlayImage(height, width);
  std::string overlayMode = "full";

  std::vector<double> megaposeTimes;
  std::vector<double> frameTimes;

  double megaposeStartTime = 0.0;

  while (true) {
    const double frameStart = vpTime::measureTimeMs();
    capture >> frame;
    if (frame.empty())
      break;

    if (I.getSize() == 0) {
      vpImageConvert::convert(frame, I);
      d.init(I);
      vpDisplay::setTitle(I, "Megapose object pose estimation");
    }
    else {
      vpImageConvert::convert(frame, I);
    }
    vpDisplay::display(I);

    // Check whether Megapose is still running
    if (!callMegapose && trackerFuture.wait_for(std::chrono::milliseconds(0)) == std::future_status::ready) {
      megaposeEstimate = trackerFuture.get();
      if (tracking) {
        megaposeTimes.push_back(vpTime::measureTimeMs() - megaposeStartTime);
      }
      callMegapose = true;
      tracking = true;

      if (overlayModel) {
        overlayImage = megapose->viewObjects({ objectName }, { megaposeEstimate.cTo }, overlayMode);
      }

      if (megaposeEstimate.score < reinitThreshold) { // If confidence is low, require a reinitialisation with 2D detection
        initialized = false;
      }
    }

    if (callMegapose) {
      if (!initialized) {
        tracking = false;
        std::optional<vpRect> detection = std::nullopt;
#if (VISP_HAVE_OPENCV_VERSION >= 0x030403) && defined(HAVE_OPENCV_DNN) && (VISP_CXX_STANDARD >= VISP_CXX_STANDARD_17)
        if (detectionMethod == DetectionMethod::DNN) {
          detection = detectObjectForInitMegaposeDnn(
            dnn, frame, objectName, initialized ? std::optional(megaposeEstimate) : std::nullopt);
        }
#endif
        if (detectionMethod == DetectionMethod::CLICK) {
          detection = detectObjectForInitMegaposeClick(I);
        }

        if (detection) {
          initialized = true;
          lastDetection = *detection;
          trackerFuture = megaposeTracker.init(I, lastDetection);
          callMegapose = false;
        }
      }
      else {
        trackerFuture = megaposeTracker.track(I);
        callMegapose = false;
        megaposeStartTime = vpTime::measureTimeMs();
      }
    }

    std::string keyboardEvent;
    const bool keyPressed = vpDisplay::getKeyboardEvent(I, keyboardEvent, false);
    if (keyPressed) {
      if (keyboardEvent == "t") {
        overlayModel = !overlayModel;
      }
      else if (keyboardEvent == "w") {
        overlayMode = overlayMode == "full" ? "wireframe" : "full";
      }
    }

    if (tracking) {
      if (overlayModel) {
        overlayRender(I, overlayImage);
        vpDisplay::display(I);
      }
      vpDisplay::displayText(I, 20, 20, "Right click to quit", vpColor::red);
      vpDisplay::displayText(I, 30, 20, "Press T: Toggle overlay", vpColor::red);
      vpDisplay::displayText(I, 40, 20, "Press W: Toggle wireframe", vpColor::red);
      vpDisplay::displayFrame(I, megaposeEstimate.cTo, cam, 0.05, vpColor::none, 3);
      //vpDisplay::displayRectangle(I, lastDetection, vpColor::red);
      displayScore(I, megaposeEstimate.score);
    }

    vpDisplay::flush(I);

    vpMouseButton::vpMouseButtonType button;
    if (vpDisplay::getClick(I, button, false)) {
      if (button == vpMouseButton::button3) {
        break; // Right click to stop
      }
    }
    const double frameEnd = vpTime::measureTimeMs();
    if (!isLiveCapture) {
      vpTime::wait(std::max(0.0, videoFrametime - (frameEnd - frameStart)));
    }
    frameTimes.push_back(vpTime::measureTimeMs() - frameStart);
  }
  std::cout << "Average frame time: " << vpMath::getMean(frameTimes) << std::endl;
  std::cout << "Average time between Megapose calls: " << vpMath::getMean(megaposeTimes) << std::endl;
}

#else
int main()
{
  std::cout << "Compile ViSP with the DNN tracker module, the JSON 3rd party library and the OpenCV detection module" << std::endl;
  return EXIT_SUCCESS;
}

#endif