// Copyright (C) 2013  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.

// NOTE: This header is documentation only.  The guard macro below is #undef'd
// immediately before it is tested with #ifdef, so nothing in this file is ever seen by
// the compiler.  The declarations exist purely to document the interface.
#undef DLIB_SCAN_fHOG_PYRAMID_ABSTRACT_Hh_
#ifdef DLIB_SCAN_fHOG_PYRAMID_ABSTRACT_Hh_

#include <vector>
#include "../image_transforms/fhog_abstract.h"
#include "object_detector_abstract.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    template <
        typename Pyramid_type,
        typename feature_extractor_type
        >
    matrix<unsigned char> draw_fhog (
        const object_detector<scan_fhog_pyramid<Pyramid_type,feature_extractor_type> >& detector,
        const unsigned long weight_index = 0,
        const long cell_draw_size = 15
    );
    /*!
        requires
            - cell_draw_size > 0
            - weight_index < detector.num_detectors()
            - detector.get_w(weight_index).size() >= detector.get_scanner().get_num_dimensions()
              (i.e. the detector must have been populated with a HOG filter)
        ensures
            - Converts the HOG filters in the given detector (specifically, the filters in
              detector.get_w(weight_index)) into an image suitable for display on the
              screen.  In particular, we draw all the HOG cells into a grayscale image in a
              way that shows the magnitude and orientation of the gradient energy in each
              cell.  The resulting image is then returned.
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename Pyramid_type,
        typename feature_extractor_type
        >
    unsigned long num_separable_filters (
        const object_detector<scan_fhog_pyramid<Pyramid_type,feature_extractor_type> >& detector,
        const unsigned long weight_index = 0
    );
    /*!
        requires
            - weight_index < detector.num_detectors()
            - detector.get_w(weight_index).size() >= detector.get_scanner().get_num_dimensions()
              (i.e. the detector must have been populated with a HOG filter)
        ensures
            - Returns the number of separable filters necessary to represent the HOG
              filters in the given detector's weight_index'th filter.  This is the filter
              defined by detector.get_w(weight_index).
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename Pyramid_type,
        typename feature_extractor_type
        >
    object_detector<scan_fhog_pyramid<Pyramid_type,feature_extractor_type> > threshold_filter_singular_values (
        const object_detector<scan_fhog_pyramid<Pyramid_type,feature_extractor_type> >& detector,
        double thresh,
        const unsigned long weight_index = 0
    );
    /*!
        requires
            - thresh >= 0
            - weight_index < detector.num_detectors()
            - detector.get_w(weight_index).size() >= detector.get_scanner().get_num_dimensions()
              (i.e. the detector must have been populated with a HOG filter)
        ensures
            - Removes all components of the filters in the given detector that have
              singular values that are smaller than the given threshold.  Therefore, this
              function allows you to control how many separable filters are in a detector.
              In particular, as thresh gets larger the quantity
              num_separable_filters(threshold_filter_singular_values(detector,thresh,weight_index),weight_index)
              will generally get smaller and therefore give a faster running detector.
              However, note that at some point a large enough thresh will drop too much
              information from the filters and their accuracy will suffer.
            - returns the updated detector
    !*/

// ----------------------------------------------------------------------------------------

    class default_fhog_feature_extractor
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                The scan_fhog_pyramid object defined below is primarily meant to be used
                with the feature extraction technique implemented by extract_fhog_features().
                This technique can generally be understood as taking an input image and
                outputting a multi-planed output image of floating point numbers that
                somehow describe the image contents.  Since there are many ways to define
                how this feature mapping is performed, the scan_fhog_pyramid allows you to
                replace the extract_fhog_features() method with a customized method of your
                choosing.  To do this you implement a class with the same interface as
                default_fhog_feature_extractor.

                Therefore, the point of default_fhog_feature_extractor is two fold.  First,
                it provides the default FHOG feature extraction method used by
                scan_fhog_pyramid.  Second, it serves to document the interface you need to
                implement to define your own custom HOG style feature extraction.
        !*/

    public:

        rectangle image_to_feats (
            const rectangle& rect,
            int cell_size,
            int filter_rows_padding,
            int filter_cols_padding
        ) const { return image_to_fhog(rect, cell_size, filter_rows_padding, filter_cols_padding); }
        /*!
            requires
                - cell_size > 0
                - filter_rows_padding > 0
                - filter_cols_padding > 0
            ensures
                - Maps a rectangle from the coordinates in an input image to the
                  corresponding area in the output feature image.
        !*/

        rectangle feats_to_image (
            const rectangle& rect,
            int cell_size,
            int filter_rows_padding,
            int filter_cols_padding
        ) const { return fhog_to_image(rect, cell_size, filter_rows_padding, filter_cols_padding); }
        /*!
            requires
                - cell_size > 0
                - filter_rows_padding > 0
                - filter_cols_padding > 0
            ensures
                - Maps a rectangle from the coordinates of the hog feature image back to
                  the input image.
                - Mapping from feature space to image space is an invertible
                  transformation.  That is, for any rectangle R we have:
                    R == image_to_feats(feats_to_image(R,cell_size,filter_rows_padding,filter_cols_padding),
                                                         cell_size,filter_rows_padding,filter_cols_padding).
        !*/

        template <
            typename image_type
            >
        void operator()(
            const image_type& img,
            dlib::array<array2d<float> >& hog,
            int cell_size,
            int filter_rows_padding,
            int filter_cols_padding
        ) const { extract_fhog_features(img,hog,cell_size,filter_rows_padding,filter_cols_padding); }
        /*!
            requires
                - image_type == is an implementation of array2d/array2d_kernel_abstract.h
                - img contains some kind of pixel type.
                  (i.e. pixel_traits<typename image_type::type> is defined)
            ensures
                - Extracts FHOG features by calling extract_fhog_features().  The results
                  are stored into #hog.  Note that if you are implementing your own
                  feature extractor you can pretty much do whatever you want in terms of
                  feature extraction so long as the following conditions are met:
                    - #hog.size() == get_num_planes()
                    - Each image plane in #hog has the same dimensions.
                    - for all valid i, r, and c:
                        - #hog[i][r][c] == a feature value describing the image content
                          centered at the following pixel location in img:
                            feats_to_image(point(c,r),cell_size,filter_rows_padding,filter_cols_padding)
        !*/

        inline unsigned long get_num_planes (
        ) const { return 31; }
        /*!
            ensures
                - returns the number of planes in the hog image output by the operator()
                  method.
        !*/
    };

    inline void serialize   (const default_fhog_feature_extractor&, std::ostream&) {}
    inline void deserialize (default_fhog_feature_extractor&, std::istream&) {}
    /*!
        Provides serialization support.  Note that there is no state in the default hog
        feature extractor so these functions do nothing.  But if you define a custom
        feature extractor then make sure you remember to serialize any state in your
        feature extractor.
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename Pyramid_type,
        typename Feature_extractor_type = default_fhog_feature_extractor
        >
    class scan_fhog_pyramid : noncopyable
    {
        /*!
            REQUIREMENTS ON Pyramid_type
                - Must be one of the pyramid_down objects defined in
                  dlib/image_transforms/image_pyramid_abstract.h or an object with a
                  compatible interface

            REQUIREMENTS ON Feature_extractor_type
                - Must be a type with an interface compatible with the
                  default_fhog_feature_extractor.

            INITIAL VALUE
                - get_padding()   == 1
                - get_cell_size() == 8
                - get_detection_window_width()   == 64
                - get_detection_window_height()  == 64
                - get_max_pyramid_levels()       == 1000
                - get_min_pyramid_layer_width()  == 64
                - get_min_pyramid_layer_height() == 64
                - get_nuclear_norm_regularization_strength() == 0

            WHAT THIS OBJECT REPRESENTS
                This object is a tool for running a fixed sized sliding window classifier
                over an image pyramid.  In particular, it slides a linear classifier over
                a HOG pyramid as discussed in the paper:
                    Histograms of Oriented Gradients for Human Detection by Navneet Dalal
                    and Bill Triggs, CVPR 2005
                However, we augment the method slightly to use the version of HOG features
                from:
                    Object Detection with Discriminatively Trained Part Based Models by
                    P. Felzenszwalb, R. Girshick, D. McAllester, D. Ramanan
                    IEEE Transactions on Pattern Analysis and Machine Intelligence, Vol. 32, No. 9, Sep. 2010
                Since these HOG features have been shown to give superior performance.

            THREAD SAFETY
                Concurrent access to an instance of this object is not safe and should be
                protected by a mutex lock except for the case where you are copying the
                configuration (via copy_configuration()) of a scan_fhog_pyramid object to
                many other threads.  In this case, it is safe to copy the configuration of
                a shared object so long as no other operations are performed on it.
        !*/

    public:
        typedef matrix<double,0,1> feature_vector_type;
        typedef Pyramid_type pyramid_type;
        typedef Feature_extractor_type feature_extractor_type;

        scan_fhog_pyramid (
        );
        /*!
            ensures
                - this object is properly initialized
        !*/

        explicit scan_fhog_pyramid (
            const feature_extractor_type& fe
        );
        /*!
            ensures
                - this object is properly initialized
                - #get_feature_extractor() == fe
        !*/

        template <
            typename image_type
            >
        void load (
            const image_type& img
        );
        /*!
            requires
                - image_type == is an implementation of array2d/array2d_kernel_abstract.h
                - img contains some kind of pixel type.
                  (i.e. pixel_traits<typename image_type::type> is defined)
            ensures
                - #is_loaded_with_image() == true
                - This object is ready to run a classifier over img to detect object
                  locations.  Call detect() to do this.
        !*/

        const feature_extractor_type& get_feature_extractor(
        ) const;
        /*!
            ensures
                - returns a const reference to the feature extractor used by this object.
        !*/

        bool is_loaded_with_image (
        ) const;
        /*!
            ensures
                - returns true if this object has been loaded with an image to process and
                  false otherwise.
        !*/

        void copy_configuration (
            const scan_fhog_pyramid& item
        );
        /*!
            ensures
                - Copies all the state information of item into *this, except for state
                  information populated by load().  More precisely, given two
                  scan_fhog_pyramid objects S1 and S2, the following sequence of
                  instructions should always result in both of them having the exact same
                  state:
                    S2.copy_configuration(S1);
                    S1.load(img);
                    S2.load(img);
        !*/

        void set_detection_window_size (
            unsigned long window_width,
            unsigned long window_height
        );
        /*!
            requires
                - window_width > 0
                - window_height > 0
            ensures
                - When detect() is called, this object scans a window that is of the given
                  width and height (in pixels) over each layer in an image pyramid.  This
                  means that the rectangle detections which come out of detect() will have
                  a width to height ratio approximately equal to window_width/window_height
                  and will be approximately window_width*window_height pixels in area or
                  larger.  Therefore, the smallest object that can be detected is roughly
                  window_width by window_height pixels in size.
                - #get_detection_window_width() == window_width
                - #get_detection_window_height() == window_height
                - Since we use a HOG feature representation, the detection procedure works
                  as follows:
                    Step 1. Make an image pyramid.
                    Step 2. Convert each layer of the image pyramid into a multi-planed HOG "image".
                            (the number of bands is given by get_feature_extractor().get_num_planes())
                    Step 3. Scan a linear classifier over each HOG image in the pyramid.
                  Moreover, the HOG features quantize the input image into a grid of cells,
                  each cell being get_cell_size() by get_cell_size() pixels in size.  So
                  when we scan the object detector over the pyramid we are scanning an
                  appropriately sized window over these smaller quantized HOG features.  In
                  particular, the size of the window we scan over the HOG feature pyramid
                  is #get_fhog_window_width() by #get_fhog_window_height() HOG cells in
                  size.
                - #is_loaded_with_image() == false
        !*/

        unsigned long get_detection_window_width (
        ) const;
        /*!
            ensures
                - returns the width, in pixels, of the detection window that is scanned
                  over the image when detect() is called.
        !*/

        inline unsigned long get_detection_window_height (
        ) const;
        /*!
            ensures
                - returns the height, in pixels, of the detection window that is scanned
                  over the image when detect() is called.
        !*/

        unsigned long get_fhog_window_width (
        ) const;
        /*!
            ensures
                - Returns the width of the HOG scanning window in terms of HOG cell blocks.
                  Note that this is a function of get_detection_window_width(),
                  get_cell_size(), and get_padding() and is therefore not something you set
                  directly.
                - #get_fhog_window_width() is approximately equal to the number of HOG
                  cells that fit into get_detection_window_width() pixels plus
                  2*get_padding() since we include additional padding around each window to
                  add context.
        !*/

        unsigned long get_fhog_window_height (
        ) const;
        /*!
            ensures
                - Returns the height of the HOG scanning window in terms of HOG cell
                  blocks.  Note that this is a function of get_detection_window_height(),
                  get_cell_size(), and get_padding() and is therefore not something you set
                  directly.
                - #get_fhog_window_height() is approximately equal to the number of HOG
                  cells that fit into get_detection_window_height() pixels plus
                  2*get_padding() since we include additional padding around each window to
                  add context.
        !*/

        void set_padding (
            unsigned long new_padding
        );
        /*!
            ensures
                - #get_padding() == new_padding
                - #is_loaded_with_image() == false
        !*/

        unsigned long get_padding (
        ) const;
        /*!
            ensures
                - The HOG windows scanned over the HOG pyramid can include additional HOG
                  cells outside the detection window.  This can help add context and
                  improve detection accuracy.  This function returns the number of extra
                  HOG cells added onto the border of the HOG windows which are scanned by
                  detect().
        !*/

        unsigned long get_cell_size (
        ) const;
        /*!
            ensures
                - Returns the size of the HOG cells.  Each HOG cell is square and contains
                  get_cell_size()*get_cell_size() pixels.
        !*/

        void set_cell_size (
            unsigned long new_cell_size
        );
        /*!
            requires
                - new_cell_size > 0
            ensures
                - #get_cell_size() == new_cell_size
                - #is_loaded_with_image() == false
        !*/

        inline long get_num_dimensions (
        ) const;
        /*!
            ensures
                - returns get_fhog_window_width()*get_fhog_window_height()*get_feature_extractor().get_num_planes()
                  (i.e. The number of features is equal to the size of the HOG window times
                  the number of planes output by the feature extractor. )
        !*/

        inline unsigned long get_num_detection_templates (
        ) const { return 1; }
        /*!
            ensures
                - returns 1.  Note that this function is here only for compatibility with
                  the scan_image_pyramid object.  Notionally, its return value indicates
                  that a scan_fhog_pyramid object is always ready to detect objects once
                  an image has been loaded.
        !*/

        inline unsigned long get_num_movable_components_per_detection_template (
        ) const { return 0; }
        /*!
            ensures
                - returns 0.  Note that this function is here only for compatibility with
                  the scan_image_pyramid object.  Its return value means that this object
                  does not support using movable part models.
        !*/

        unsigned long get_max_pyramid_levels (
        ) const;
        /*!
            ensures
                - returns the maximum number of image pyramid levels this object will use.
                  Note that #get_max_pyramid_levels() == 1 indicates that no image pyramid
                  will be used at all.  That is, only the original image will be processed
                  and no lower scale versions will be created.
        !*/

        void set_max_pyramid_levels (
            unsigned long max_levels
        );
        /*!
            requires
                - max_levels > 0
            ensures
                - #get_max_pyramid_levels() == max_levels
        !*/

        void set_min_pyramid_layer_size (
            unsigned long width,
            unsigned long height
        );
        /*!
            requires
                - width > 0
                - height > 0
            ensures
                - #get_min_pyramid_layer_width() == width
                - #get_min_pyramid_layer_height() == height
        !*/

        inline unsigned long get_min_pyramid_layer_width (
        ) const;
        /*!
            ensures
                - returns the smallest allowable width of an image in the image pyramid.
                  All pyramids will always include the original input image, however, no
                  pyramid levels will be created which have a width smaller than the
                  value returned by this function.
        !*/

        inline unsigned long get_min_pyramid_layer_height (
        ) const;
        /*!
            ensures
                - returns the smallest allowable height of an image in the image pyramid.
                  All pyramids will always include the original input image, however, no
                  pyramid levels will be created which have a height smaller than the
                  value returned by this function.
        !*/

        // fhog_filterbank is declared ahead of build_fhog_filterbank() and detect() so
        // that the type name is known at the point where those members use it.
        class fhog_filterbank
        {
            /*!
                WHAT THIS OBJECT REPRESENTS
                    This object represents a HOG filter bank.  That is, the classifier that
                    is slid over a HOG pyramid is a set of
                    get_feature_extractor().get_num_planes() linear filters, each
                    get_fhog_window_height() rows by get_fhog_window_width() columns in
                    size.  This object contains that set of filters.
            !*/

        public:
            long get_num_dimensions(
            ) const;
            /*!
                ensures
                    - Returns the total number of values in the filters.
            !*/

            const std::vector<matrix<float> >& get_filters(
            ) const;
            /*!
                ensures
                    - returns the set of HOG filters in this object.
            !*/

            unsigned long num_separable_filters(
            ) const;
            /*!
                ensures
                    - returns the number of separable filters necessary to represent all
                      the filters in get_filters().
            !*/
        };

        fhog_filterbank build_fhog_filterbank (
            const feature_vector_type& weights
        ) const;
        /*!
            requires
                - weights.size() >= get_num_dimensions()
            ensures
                - Creates and then returns a fhog_filterbank object FB such that:
                    - FB.get_num_dimensions() == get_num_dimensions()
                    - FB.get_filters() == the values in weights unpacked into
                      get_feature_extractor().get_num_planes() filters.
                    - FB.num_separable_filters() == the number of separable filters
                      necessary to represent all the filters in FB.get_filters().
        !*/

        void detect (
            const fhog_filterbank& w,
            std::vector<std::pair<double, rectangle> >& dets,
            const double thresh
        ) const;
        /*!
            requires
                - w.get_num_dimensions() == get_num_dimensions()
                - is_loaded_with_image() == true
            ensures
                - Scans the HOG filter defined by w over the HOG pyramid that was populated
                  by the last call to load() and stores all object detections into #dets.
                - for all valid i:
                    - #dets[i].second == The object box which produced this detection.  This
                      rectangle gives the location of the detection.  Note that the
                      rectangle will have been converted back into the original image input
                      space.  That is, if this detection was made at a low level in the
                      image pyramid then the object box will have been automatically mapped
                      up the pyramid layers to the original image space.  Or in other
                      words, if you plot #dets[i].second on top of the image given to
                      load() it will show up in the right place.
                    - #dets[i].first == The score for this detection.  This value is equal
                      to dot(w, feature vector for this sliding window location).
                    - #dets[i].first >= thresh
                - #dets will be sorted in descending order.
                  (i.e.  #dets[i].first >= #dets[j].first for all i, and j>i)
                - Elements of w beyond index get_num_dimensions()-1 are ignored.  I.e. only
                  the first get_num_dimensions() are used.
                - Note that no form of non-max suppression is performed.  If a window has a
                  score >= thresh then it is reported in #dets.
        !*/

        void detect (
            const feature_vector_type& w,
            std::vector<std::pair<double, rectangle> >& dets,
            const double thresh
        ) const;
        /*!
            requires
                - w.size() >= get_num_dimensions()
                - is_loaded_with_image() == true
            ensures
                - performs: detect(build_fhog_filterbank(w), dets, thresh)
        !*/

        void get_feature_vector (
            const full_object_detection& obj,
            feature_vector_type& psi
        ) const;
        /*!
            requires
                - obj.num_parts() == 0
                - is_loaded_with_image() == true
                - psi.size() >= get_num_dimensions()
                  (i.e. psi must have preallocated its memory before this function is called)
            ensures
                - This function allows you to determine the feature vector used for an
                  object detection output from detect().  Note that this vector is
                  added to psi.  Note also that you can use get_full_object_detection() to
                  convert a rectangle from detect() into the needed full_object_detection.
                - The dimensionality of the vector added to psi is get_num_dimensions().  This
                  means that elements of psi after psi(get_num_dimensions()-1) are not modified.
                - Since scan_fhog_pyramid only searches a limited set of object locations,
                  not all possible rectangles can be output by detect().  So in the case
                  where obj.get_rect() could not arise from a call to detect(), this
                  function will map obj.get_rect() to the nearest possible rectangle and
                  then add the feature vector for the mapped rectangle into #psi.
                - get_best_matching_rect(obj.get_rect()) == the rectangle obj.get_rect()
                  gets mapped to for feature extraction.
        !*/

        full_object_detection get_full_object_detection (
            const rectangle& rect,
            const feature_vector_type& w
        ) const;
        /*!
            ensures
                - returns full_object_detection(rect)
                  (This function is here only for compatibility with the scan_image_pyramid
                  object)
        !*/

        const rectangle get_best_matching_rect (
            const rectangle& rect
        ) const;
        /*!
            ensures
                - Since scan_fhog_pyramid only searches a limited set of object locations,
                  not all possible rectangles can be represented.  Therefore, this function
                  allows you to supply a rectangle and obtain the nearest possible
                  candidate object location rectangle.
        !*/

        double get_nuclear_norm_regularization_strength (
        ) const;
        /*!
            ensures
                - If the number of separable filters in a fhog_filterbank is small then the
                  filter bank can be scanned over an image much faster than a normal set of
                  filters.  Therefore, this object provides the option to encourage
                  machine learning methods that learn a HOG filter bank (i.e.
                  structural_object_detection_trainer) to select filter banks that have
                  this beneficial property.  In particular, the value returned by
                  get_nuclear_norm_regularization_strength() is a multiplier on a nuclear
                  norm regularizer which will encourage the selection of filters that use a
                  small number of separable components.  Larger values tend to give a
                  smaller number of separable filters.
                - if (get_nuclear_norm_regularization_strength() == 0) then
                    - This feature is disabled
                - else
                    - A nuclear norm regularizer will be added when
                      structural_object_detection_trainer is used to learn a HOG filter
                      bank.  Note that this can make the training process take
                      significantly longer (but can result in faster object detectors).
        !*/

        void set_nuclear_norm_regularization_strength (
            double strength
        );
        /*!
            requires
                - strength >= 0
            ensures
                - #get_nuclear_norm_regularization_strength() == strength
        !*/

    };

// ----------------------------------------------------------------------------------------

    template <typename T, typename U>
    void serialize (
        const scan_fhog_pyramid<T,U>& item,
        std::ostream& out
    );
    /*!
        provides serialization support
        (T is the Pyramid_type and U is the Feature_extractor_type of item, so this
        overload applies to scanners using any feature extractor, not only the default)
    !*/

// ----------------------------------------------------------------------------------------

    template <typename T, typename U>
    void deserialize (
        scan_fhog_pyramid<T,U>& item,
        std::istream& in
    );
    /*!
        provides deserialization support
        (T is the Pyramid_type and U is the Feature_extractor_type of item, so this
        overload applies to scanners using any feature extractor, not only the default)
    !*/

// ----------------------------------------------------------------------------------------
// ----------------------------------------------------------------------------------------

    template <
        typename pyramid_type,
        typename image_type
        >
    void evaluate_detectors (
        const std::vector<object_detector<scan_fhog_pyramid<pyramid_type>>>& detectors,
        const image_type& img,
        std::vector<rect_detection>& dets,
        const double adjust_threshold = 0
    );
    /*!
        requires
            - image_type == is an implementation of array2d/array2d_kernel_abstract.h
            - img contains some kind of pixel type.
              (i.e. pixel_traits<typename image_type::type> is defined)
        ensures
            - This function runs each of the provided object_detector objects over img and
              stores the resulting detections into #dets.  Importantly, this function is
              faster than running each detector individually because it computes the HOG
              features only once and then reuses them for each detector.  However, it is
              important to note that this speedup is only possible if all the detectors use
              the same cell_size parameter that determines how HOG features are computed.
              If different cell_size values are used then this function will not be any
              faster than running the detectors individually.
            - This function applies non-max suppression individually to the output of each
              detector.  Therefore, the output is the same as if you ran each detector
              individually and then concatenated the results.
            - To be precise, this function performs object detection on the given image and
              stores the detected objects into #dets.  In particular, we will have that:
                - #dets is sorted such that the highest confidence detections come first.
                  E.g. element 0 is the best detection, element 1 the next best, and so on.
                - #dets.size() == the number of detected objects.
                - #dets[i].detection_confidence == The strength of the i-th detection.
                  Larger values indicate that the detector is more confident that #dets[i]
                  is a correct detection rather than being a false alarm.  Moreover, the
                  detection_confidence is equal to the detection value output by the
                  scanner minus the threshold value stored at the end of the weight vector.
                - #dets[i].rect == the bounding box for the i-th detection.
                - The detection #dets[i].rect was produced by detectors[#dets[i].weight_index].
            - The detection threshold is adjusted by having adjust_threshold added to it.
              Therefore, an adjust_threshold value > 0 makes detecting objects harder while
              a negative value makes it easier.  Moreover, the following will be true for
              all valid i:
                - #dets[i].detection_confidence >= adjust_threshold
              This means that, for example, you can obtain the maximum possible number of
              detections by setting adjust_threshold equal to negative infinity.
            - This function is threadsafe in the sense that multiple threads can call
              evaluate_detectors() with the same instances of detectors and img without
              requiring a mutex lock.
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename pyramid_type,
        typename image_type
        >
    std::vector<rectangle> evaluate_detectors (
        const std::vector<object_detector<scan_fhog_pyramid<pyramid_type>>>& detectors,
        const image_type& img,
        const double adjust_threshold = 0
    );
    /*!
        requires
            - image_type == is an implementation of array2d/array2d_kernel_abstract.h
            - img contains some kind of pixel type.
              (i.e. pixel_traits<typename image_type::type> is defined)
        ensures
            - This function just calls the above evaluate_detectors() routine and copies
              the output dets into a vector<rectangle> object and returns it.  Therefore,
              this function is provided for convenience.
            - This function is threadsafe in the sense that multiple threads can call
              evaluate_detectors() with the same instances of detectors and img without
              requiring a mutex lock.
    !*/

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_SCAN_fHOG_PYRAMID_ABSTRACT_Hh_