dlib C++ Library - structural_svm_object_detection

// Copyright (C) 2011  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_STRUCTURAL_SVM_ObJECT_DETECTION_PROBLEM_Hh_
#define DLIB_STRUCTURAL_SVM_ObJECT_DETECTION_PROBLEM_Hh_

#include "structural_svm_object_detection_problem_abstract.h"
#include "../matrix.h"
#include "structural_svm_problem_threaded.h"
#include <sstream>
#include "../string.h"
#include "../array.h"
#include "../image_processing/full_object_detection.h"
#include "../image_processing/box_overlap_testing.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    template <
        typename image_scanner_type,
        typename image_array_type 
        >
    class structural_svm_object_detection_problem : public structural_svm_problem_threaded<matrix<double,0,1> >,
                                                    noncopyable
    {
    public:

        structural_svm_object_detection_problem(
            const image_scanner_type& scanner,
            const test_box_overlap& overlap_tester,
            const bool auto_overlap_tester,
            const image_array_type& images_,
            const std::vector<std::vector<full_object_detection> >& truth_object_detections_,
            const std::vector<std::vector<rectangle> >& ignore_,
            const test_box_overlap& ignore_overlap_tester_,
            unsigned long num_threads = 2
        ) :
            structural_svm_problem_threaded<matrix<double,0,1> >(num_threads),
            boxes_overlap(overlap_tester),
            images(images_),
            truth_object_detections(truth_object_detections_),
            ignore(ignore_),
            ignore_overlap_tester(ignore_overlap_tester_),
            match_eps(0.5),
            loss_per_false_alarm(1),
            loss_per_missed_target(1)
        {
#ifdef ENABLE_ASSERTS
            // make sure requires clause is not broken
            DLIB_ASSERT(is_learning_problem(images_, truth_object_detections_) && 
                        ignore_.size() == images_.size() &&
                         scanner.get_num_detection_templates() > 0,
                "\t structural_svm_object_detection_problem::structural_svm_object_detection_problem()"
                << "\n\t Invalid inputs were given to this function "
                << "\n\t scanner.get_num_detection_templates(): " << scanner.get_num_detection_templates()
                << "\n\t is_learning_problem(images_,truth_object_detections_): " << is_learning_problem(images_,truth_object_detections_)
                << "\n\t ignore.size(): " << ignore.size() 
                << "\n\t images.size(): " << images.size() 
                << "\n\t this: " << this
                );
            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
            {
                for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
                {
                    DLIB_ASSERT(truth_object_detections[i][j].num_parts() == scanner.get_num_movable_components_per_detection_template(),
                        "\t trained_function_type structural_object_detection_trainer::train()"
                        << "\n\t invalid inputs were given to this function"
                        << "\n\t truth_object_detections["<<i<<"]["<<j<<"].num_parts():          " << 
                            truth_object_detections[i][j].num_parts()
                        << "\n\t scanner.get_num_movable_components_per_detection_template(): " << 
                            scanner.get_num_movable_components_per_detection_template()
                        << "\n\t all_parts_in_rect(truth_object_detections["<<i<<"]["<<j<<"]): " << all_parts_in_rect(truth_object_detections[i][j])
                    );
                }
            }
#endif
            // The purpose of the max_num_dets member variable is to give us a reasonable
            // upper limit on the number of detections we can expect from a single image.
            // This is used in the separation_oracle to put a hard limit on the number of
            // detections we will consider.  We do this purely for computational reasons
            // since otherwise we can end up wasting large amounts of time on certain
            // pathological cases during optimization which ultimately do not influence the
            // result.  Therefore, we force the separation oracle to only consider the
            // max_num_dets strongest detections.
            max_num_dets = 0;
            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
            {
                if (truth_object_detections[i].size() > max_num_dets)
                    max_num_dets = truth_object_detections[i].size();
            }
            max_num_dets = max_num_dets*3 + 10;

            initialize_scanners(scanner, num_threads);

            if (auto_overlap_tester)
            {
                auto_configure_overlap_tester();
            }
        }

        test_box_overlap get_overlap_tester (
        ) const 
        {
            return boxes_overlap;
        }

        void set_match_eps (
            double eps
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(0 < eps && eps < 1, 
                "\t void structural_svm_object_detection_problem::set_match_eps(eps)"
                << "\n\t Invalid inputs were given to this function "
                << "\n\t eps:  " << eps 
                << "\n\t this: " << this
                );

            match_eps = eps;
        }

        double get_match_eps (
        ) const
        {
            return match_eps;
        }

        double get_loss_per_missed_target (
        ) const
        {
            return loss_per_missed_target;
        }

        void set_loss_per_missed_target (
            double loss
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(loss > 0, 
                "\t void structural_svm_object_detection_problem::set_loss_per_missed_target(loss)"
                << "\n\t Invalid inputs were given to this function "
                << "\n\t loss: " << loss
                << "\n\t this: " << this
                );

            loss_per_missed_target = loss;
        }

        double get_loss_per_false_alarm (
        ) const
        {
            return loss_per_false_alarm;
        }

        void set_loss_per_false_alarm (
            double loss
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(loss > 0, 
                "\t void structural_svm_object_detection_problem::set_loss_per_false_alarm(loss)"
                << "\n\t Invalid inputs were given to this function "
                << "\n\t loss: " << loss
                << "\n\t this: " << this
                );

            loss_per_false_alarm = loss;
        }

    private:

        void auto_configure_overlap_tester(
        )
        {
            std::vector<std::vector<rectangle> > mapped_rects(truth_object_detections.size());
            for (unsigned long i = 0; i < truth_object_detections.size(); ++i)
            {
                mapped_rects[i].resize(truth_object_detections[i].size());
                for (unsigned long j = 0; j < truth_object_detections[i].size(); ++j)
                {
                    mapped_rects[i][j] = scanners[i].get_best_matching_rect(truth_object_detections[i][j].get_rect());
                }
            }

            boxes_overlap = find_tight_overlap_tester(mapped_rects);
        }


        virtual long get_num_dimensions (
        ) const 
        {
            return scanners[0].get_num_dimensions() + 
                1;// for threshold
        }

        virtual long get_num_samples (
        ) const 
        {
            return images.size();
        }

        virtual void get_truth_joint_feature_vector (
            long idx,
            feature_vector_type& psi 
        ) const 
        {
            const image_scanner_type& scanner = scanners[idx];

            psi.set_size(get_num_dimensions());
            std::vector<rectangle> mapped_rects;

            psi = 0;
            for (unsigned long i = 0; i < truth_object_detections[idx].size(); ++i)
            {
                mapped_rects.push_back(scanner.get_best_matching_rect(truth_object_detections[idx][i].get_rect()));
                scanner.get_feature_vector(truth_object_detections[idx][i], psi);
            }
            psi(scanner.get_num_dimensions()) = -1.0*truth_object_detections[idx].size();

            // check if any of the boxes overlap.  If they do then it is impossible for
            // us to learn to correctly classify this sample
            for (unsigned long i = 0; i < mapped_rects.size(); ++i)
            {
                for (unsigned long j = i+1; j < mapped_rects.size(); ++j)
                {
                    if (boxes_overlap(mapped_rects[i], mapped_rects[j]))
                    {
                        const double area_overlap = mapped_rects[i].intersect(mapped_rects[j]).area();
                        const double match_amount = area_overlap/(double)( mapped_rects[i]+mapped_rects[j]).area();
                        const double overlap_amount = area_overlap/std::min(mapped_rects[i].area(),mapped_rects[j].area());

                        using namespace std;
                        ostringstream sout;
                        sout << "An impossible set of object labels was detected. This is happening because ";
                        sout << "the truth labels for an image contain rectangles which overlap according to the ";
                        sout << "test_box_overlap object supplied for non-max suppression.  To resolve this, you ";
                        sout << "either need to relax the test_box_overlap object so it doesn't mark these rectangles as ";
                        sout << "overlapping or adjust the truth rectangles. ";

                        // make sure the above string fits nicely into a command prompt window.
                        string temp = sout.str();
                        sout.str(""); sout << wrap_string(temp,0,0) << endl << endl;


                        sout << "image index: "<< idx << endl;
                        sout << "The offending rectangles are:\n";
                        sout << "rect1: "<< mapped_rects[i] << endl;
                        sout << "rect2: "<< mapped_rects[j] << endl;
                        sout << "match amount:   " << match_amount << endl;
                        sout << "overlap amount: " << overlap_amount << endl;
                        throw dlib::impossible_labeling_error(sout.str());
                    }
                }
            }

            // make sure the mapped rectangles are within match_eps of the
            // truth rectangles.
            for (unsigned long i = 0; i < mapped_rects.size(); ++i)
            {
                const double area = (truth_object_detections[idx][i].get_rect().intersect(mapped_rects[i])).area();
                const double total_area = (truth_object_detections[idx][i].get_rect() + mapped_rects[i]).area();
                if (area/total_area <= match_eps)
                {
                    using namespace std;
                    ostringstream sout;
                    sout << "An impossible set of object labels was detected.  This is happening because ";
                    sout << "none of the object locations checked by the supplied image scanner is a close ";
                    sout << "enough match to one of the truth boxes.  To resolve this you need to either lower the match_eps ";
                    sout << "or adjust the settings of the image scanner so that it hits this truth box.  ";
                    sout << "Or you could adjust the ";
                    sout << "offending truth rectangle so it can be matched by the current image scanner.  Also, if you ";
                    sout << "are using the scan_image_pyramid object then you could try using a finer image pyramid ";
                    sout << "or adding more detection templates.  E.g. if one of ";
                    sout << "your existing detection templates has a matching width/height ratio and smaller area ";
                    sout << "than the offending rectangle then a finer image pyramid would probably help.";


                    // make sure the above string fits nicely into a command prompt window.
                    string temp = sout.str();
                    sout.str(""); sout << wrap_string(temp,0,0) << endl << endl;

                    sout << "image index              "<< idx << endl;
                    sout << "match_eps:               "<< match_eps << endl;
                    sout << "best possible match:     "<< area/total_area << endl;
                    sout << "truth rect:              "<< truth_object_detections[idx][i].get_rect() << endl;
                    sout << "truth rect width/height: "<< truth_object_detections[idx][i].get_rect().width()/(double)truth_object_detections[idx][i].get_rect().height() << endl;
                    sout << "truth rect area:         "<< truth_object_detections[idx][i].get_rect().area() << endl;
                    sout << "nearest detection template rect:              "<< mapped_rects[i] << endl;
                    sout << "nearest detection template rect width/height: "<< mapped_rects[i].width()/(double)mapped_rects[i].height() << endl;
                    sout << "nearest detection template rect area:         "<< mapped_rects[i].area() << endl;
                    throw dlib::impossible_labeling_error(sout.str());
                }

            }
        }

        virtual void separation_oracle (
            const long idx,
            const matrix_type& current_solution,
            scalar_type& loss,
            feature_vector_type& psi
        ) const 
        {
            const image_scanner_type& scanner = scanners[idx];

            std::vector<std::pair<double, rectangle> > dets;
            const double thresh = current_solution(scanner.get_num_dimensions());


            scanner.detect(current_solution, dets, thresh-loss_per_false_alarm);


            // The loss will measure the number of incorrect detections.  A detection is
            // incorrect if it doesn't hit a truth rectangle or if it is a duplicate detection
            // on a truth rectangle.
            loss = truth_object_detections[idx].size()*loss_per_missed_target;

            // Measure the loss augmented score for the detections which hit a truth rect.
            std::vector<double> truth_score_hits(truth_object_detections[idx].size(), 0);

            // keep track of which truth boxes we have hit so far.
            std::vector<bool> hit_truth_table(truth_object_detections[idx].size(), false);

            std::vector<rectangle> final_dets;
            // The point of this loop is to fill out the truth_score_hits array. 
            for (unsigned long i = 0; i < dets.size() && final_dets.size() < max_num_dets; ++i)
            {
                if (overlaps_any_box(boxes_overlap, final_dets, dets[i].second))
                    continue;

                const std::pair<double,unsigned int> truth = find_best_match(truth_object_detections[idx], dets[i].second);

                final_dets.push_back(dets[i].second);

                const double truth_match = truth.first;
                // if hit truth rect
                if (truth_match > match_eps)
                {
                    // if this is the first time we have seen a detect which hit truth_object_detections[idx][truth.second]
                    const double score = dets[i].first - thresh;
                    if (hit_truth_table[truth.second] == false)
                    {
                        hit_truth_table[truth.second] = true;
                        truth_score_hits[truth.second] += score;
                    }
                    else
                    {
                        truth_score_hits[truth.second] += score + loss_per_false_alarm;
                    }
                }
            }

            hit_truth_table.assign(hit_truth_table.size(), false);

            final_dets.clear();
#ifdef ENABLE_ASSERTS
            double total_score = 0;
#endif
            // Now figure out which detections jointly maximize the loss and detection score sum.  We
            // need to take into account the fact that allowing a true detection in the output, while 
            // initially reducing the loss, may allow us to increase the loss later with many duplicate
            // detections.
            for (unsigned long i = 0; i < dets.size() && final_dets.size() < max_num_dets; ++i)
            {
                if (overlaps_any_box(boxes_overlap, final_dets, dets[i].second))
                    continue;

                const std::pair<double,unsigned int> truth = find_best_match(truth_object_detections[idx], dets[i].second);

                const double truth_match = truth.first;
                if (truth_match > match_eps)
                {
                    if (truth_score_hits[truth.second] > loss_per_missed_target)
                    {
                        if (!hit_truth_table[truth.second])
                        {
                            hit_truth_table[truth.second] = true;
                            final_dets.push_back(dets[i].second);
#ifdef ENABLE_ASSERTS
                            total_score += dets[i].first;
#endif
                            loss -= loss_per_missed_target;
                        }
                        else
                        {
                            final_dets.push_back(dets[i].second);
#ifdef ENABLE_ASSERTS
                            total_score += dets[i].first;
#endif
                            loss += loss_per_false_alarm;
                        }
                    }
                }
                else if (!overlaps_ignore_box(idx,dets[i].second))
                {
                    // didn't hit anything
                    final_dets.push_back(dets[i].second);
#ifdef ENABLE_ASSERTS
                    total_score += dets[i].first;
#endif
                    loss += loss_per_false_alarm;
                }
            }

            psi.set_size(get_num_dimensions());
            psi = 0;
            for (unsigned long i = 0; i < final_dets.size(); ++i)
                scanner.get_feature_vector(scanner.get_full_object_detection(final_dets[i], current_solution), psi);

#ifdef ENABLE_ASSERTS
            const double psi_score = dot(psi, current_solution);
            DLIB_CASSERT(std::abs(psi_score-total_score) <= 1e-4 * std::max(1.0,std::max(std::abs(psi_score),std::abs(total_score))),
                        "\t The get_feature_vector() and detect() methods of image_scanner_type are not in sync." 
                        << "\n\t The relative error is too large to be attributed to rounding error."
                        << "\n\t error:       " << std::abs(psi_score-total_score)
                        << "\n\t psi_score:   " << psi_score
                        << "\n\t total_score: " << total_score
            );
#endif

            psi(scanner.get_num_dimensions()) = -1.0*final_dets.size();
        }


        bool overlaps_ignore_box (
            const long idx,
            const dlib::rectangle& rect
        ) const
        {
            for (unsigned long i = 0; i < ignore[idx].size(); ++i)
            {
                if (ignore_overlap_tester(ignore[idx][i], rect))
                    return true;
            }
            return false;
        }

        std::pair<double,unsigned int> find_best_match(
            const std::vector<full_object_detection>& boxes,
            const rectangle rect
        ) const
        /*!
            ensures
                - determines which rectangle in boxes matches rect the most and
                  returns the amount of this match.  Specifically, the match is
                  a number O with the following properties:
                    - 0 <= O <= 1
                    - Let R be the maximum matching rectangle in boxes, then
                      O == (R.intersect(rect)).area() / (R + rect).area()
                    - O == 0 if there is no match with any rectangle.
        !*/
        {
            double match = 0;
            unsigned int best_idx = 0;
            for (unsigned long i = 0; i < boxes.size(); ++i)
            {

                const unsigned long area = rect.intersect(boxes[i].get_rect()).area();
                if (area != 0)
                {
                    const double new_match = area / static_cast<double>((rect + boxes[i].get_rect()).area());
                    if (new_match > match)
                    {
                        match = new_match;
                        best_idx = i;
                    }
                }
            }

            return std::make_pair(match,best_idx);
        }

        struct init_scanners_helper
        {
            init_scanners_helper (
                array<image_scanner_type>& scanners_,
                const image_array_type& images_
            ) :
                scanners(scanners_),
                images(images_)
            {}

            array<image_scanner_type>& scanners;
            const image_array_type& images;

            void operator() (long i ) const
            {
                scanners[i].load(images[i]);
            }
        };

        void initialize_scanners (
            const image_scanner_type& scanner,
            unsigned long num_threads
        )
        {
            scanners.set_max_size(images.size());
            scanners.set_size(images.size());

            for (unsigned long i = 0; i < scanners.size(); ++i)
                scanners[i].copy_configuration(scanner);

            // now load the images into all the scanners
            parallel_for(num_threads, 0, scanners.size(), init_scanners_helper(scanners, images));
        }


        test_box_overlap boxes_overlap;

        mutable array<image_scanner_type> scanners;

        const image_array_type& images;
        const std::vector<std::vector<full_object_detection> >& truth_object_detections;
        const std::vector<std::vector<rectangle> >& ignore;
        const test_box_overlap ignore_overlap_tester;

        unsigned long max_num_dets;
        double match_eps;
        double loss_per_false_alarm;
        double loss_per_missed_target;
    };

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_STRUCTURAL_SVM_ObJECT_DETECTION_PROBLEM_Hh_