// Copyright (C) 2013 Davis E. King (davis@dlib.net)
// License: Boost Software License See LICENSE.txt for the full license.

// NOTE: the guard macro is #undef'd immediately before it is tested, so the
// preprocessor discards everything between the #ifdef and the closing #endif.
// (Presumably this file exists only to document the interface -- confirm
// against the project's conventions for *_abstract.h files.)
#undef DLIB_SCAN_IMAGE_bOXES_ABSTRACT_Hh_
#ifdef DLIB_SCAN_IMAGE_bOXES_ABSTRACT_Hh_

#include "../matrix.h"
#include "../geometry.h"
#include "../image_processing.h"
#include "../array2d.h"
#include "full_object_detection_abstract.h"
#include "../image_transforms/segment_image_abstract.h"
#include <vector>

namespace dlib
{

// ----------------------------------------------------------------------------------------

    class default_box_generator
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                A function object which, given an image, outputs a set of candidate
                object locations.  It is also the box generator that the
                scan_image_boxes object defined below uses by default.
        !*/

    public:

        template <typename image_type>
        void operator() (const image_type& img, std::vector<rectangle>& rects) const
        /*!
            ensures
                - #rects == the set of candidate object locations which should be
                  searched inside img.  In other words, #rects holds the rectangles
                  which might contain objects of interest within the given image.
        !*/
        {
            // Discard whatever the caller left in rects before the candidate
            // locations for img are produced.
            rects.clear();
            find_candidate_object_locations(img, rects);
        }
    };

    inline void serialize   (const default_box_generator&, std::ostream&) {}
    inline void deserialize (default_box_generator&, std::istream&) {}
    /*!
        ensures
            - provides serialization support.  (Both function bodies are empty.)
    !*/

// ----------------------------------------------------------------------------------------

    template <typename Feature_extractor_type, typename Box_generator = default_box_generator>
    class scan_image_boxes : noncopyable
    {
        /*!
            REQUIREMENTS ON Feature_extractor_type
                - must be an object whose interface is compatible with either the
                  hashed_feature_image object defined in
                  dlib/image_keypoint/hashed_feature_image_abstract.h or the
                  nearest_neighbor_feature_image object defined in
                  dlib/image_keypoint/nearest_neighbor_feature_image_abstract.h

            REQUIREMENTS ON Box_generator
                - must be an object whose interface is compatible with the
                  default_box_generator object defined at the top of this file.

            INITIAL VALUE
                - get_num_spatial_pyramid_levels() == 3
                - is_loaded_with_image() == false

            WHAT THIS OBJECT REPRESENTS
                This object is a tool for running a classifier over an image with the
                goal of localizing each object present.  The localization takes the
                form of a bounding box around each object of interest.

                Unlike the scan_image_pyramid object, which scans a fixed sized window
                over an image pyramid, the scan_image_boxes tool lets you define your
                own list of "candidate object locations" which should be evaluated.
                This is simply a list of rectangle objects which might contain objects
                of interest.  The scan_image_boxes object then evaluates the classifier
                at each of these locations and returns the subset of rectangles which
                appear to have objects in them.  The candidate object locations are
                generated by the Box_generator that is passed in as a template
                argument.

                This object can also be understood as a general tool for implementing
                the spatial pyramid models described in the paper:
                    Beyond Bags of Features: Spatial Pyramid Matching for Recognizing
                    Natural Scene Categories by Svetlana Lazebnik, Cordelia Schmid,
                    and Jean Ponce

                The classifiers used by this object have three parts:
                   1. The underlying feature extraction provided by
                      Feature_extractor_type objects, which associate a vector with
                      each location in an image.

                   2. A rule for extracting a feature vector from a candidate object
                      location.  In this object we use the spatial pyramid matching
                      method.  This means we cut an object's detection window into a
                      set of "feature extraction regions" and extract a bag-of-words
                      vector from each, then concatenate them to form the final feature
                      vector representing the entire object window.  The user can
                      configure the set of feature extraction regions by calling
                      set_num_spatial_pyramid_levels().  To be a little more precise,
                      the feature vector for a candidate object window is defined as
                      follows:
                        - Let N denote the number of feature extraction zones.
                        - Let M denote the dimensionality of the vectors output by
                          Feature_extractor_type objects.
                        - Let F(i) == the M dimensional vector which is the sum of all
                          vectors given by our Feature_extractor_type object inside the
                          i-th feature extraction zone.  So this is notionally a
                          bag-of-words vector from the i-th zone.
                        - Then the feature vector for an object window is an M*N
                          dimensional vector [F(1) F(2) F(3) ... F(N)] (i.e. it is a
                          concatenation of the N vectors).  This feature vector can be
                          thought of as a collection of N bags-of-words, each bag
                          coming from a spatial location determined by one of the
                          feature extraction zones.

                   3. A weight vector and a threshold value.  The dot product between
                      the weight vector and the feature vector for a candidate object
                      location gives the score of the location.  If this score is
                      greater than the threshold value then the candidate object
                      location is output as a detection.

            THREAD SAFETY
                Concurrent access to an instance of this object is not safe and should
                be protected by a mutex lock.  The one exception is the case where you
                are copying the configuration (via copy_configuration()) of a
                scan_image_boxes object to many other threads.  In that case it is
                safe to copy the configuration of a shared object so long as no other
                operations are performed on it.
        !*/

    public:

        typedef matrix<double,0,1> feature_vector_type;
        typedef Feature_extractor_type feature_extractor_type;
        typedef Box_generator box_generator;

        scan_image_boxes ();
        /*!
            ensures
                - this object is properly initialized
        !*/

        template <typename image_type>
        void load (const image_type& img);
        /*!
            requires
                - image_type must be a type with the following properties:
                    - image_type objects can be loaded into Feature_extractor_type
                      objects via Feature_extractor_type::load().
                    - image_type objects can be passed to the first argument of
                      Box_generator::operator()
            ensures
                - #is_loaded_with_image() == true
                - This object is ready to run a classifier over img to detect object
                  locations.  Call detect() to do this.
        !*/

        bool is_loaded_with_image () const;
        /*!
            ensures
                - returns true if this object has been loaded with an image to process
                  and false otherwise.
        !*/

        const feature_extractor_type& get_feature_extractor () const;
        /*!
            ensures
                - returns a const reference to the feature_extractor_type object used
                  internally for local feature extraction.
        !*/

        void copy_configuration (const feature_extractor_type& fe);
        /*!
            ensures
                - This function performs the equivalent of
                  get_feature_extractor().copy_configuration(fe) (i.e. this function
                  allows you to configure the parameters of the underlying feature
                  extractor used by a scan_image_boxes object)
        !*/

        void copy_configuration (const box_generator& bg);
        /*!
            ensures
                - #get_box_generator() == bg
                  (i.e. this function allows you to configure the parameters of the
                  underlying box generator used by a scan_image_boxes object)
        !*/

        const box_generator& get_box_generator () const;
        /*!
            ensures
                - returns the box_generator used by this object to generate candidate
                  object locations.
        !*/

        void copy_configuration (const scan_image_boxes& item);
        /*!
            ensures
                - Copies all the state information of item into *this, except for state
                  information populated by load().  More precisely, given two
                  scan_image_boxes objects S1 and S2, the following sequence of
                  instructions should always result in both of them having the exact
                  same state:
                    S2.copy_configuration(S1);
                    S1.load(img);
                    S2.load(img);
        !*/

        long get_num_dimensions () const;
        /*!
            ensures
                - returns the number of dimensions in the feature vector for a
                  candidate object location.  This value is the dimensionality of the
                  underlying feature vectors produced by Feature_extractor_type times
                  the number of feature extraction regions used.  Note that the number
                  of feature extraction regions used is a function of
                  get_num_spatial_pyramid_levels().
        !*/

        unsigned long get_num_spatial_pyramid_levels () const;
        /*!
            ensures
                - returns the number of layers in the spatial pyramid.  For example:
                    - If this function returns 1 then we use a simple bag-of-words
                      representation over the whole object window.
                    - If it returns 2 then the feature representation is the
                      concatenation of 5 bag-of-words vectors, one from the entire
                      object window and 4 others from 4 different parts of the object
                      window.
                    - If it returns 3 then there are 1+4+16 bag-of-words vectors
                      concatenated together in the feature representation, and so on.
        !*/

        void set_num_spatial_pyramid_levels (unsigned long levels);
        /*!
            requires
                - levels > 0
            ensures
                - #get_num_spatial_pyramid_levels() == levels
        !*/

        void detect (
            const feature_vector_type& w,
            std::vector<std::pair<double, rectangle> >& dets,
            const double thresh
        ) const;
        /*!
            requires
                - w.size() >= get_num_dimensions()
                - is_loaded_with_image() == true
            ensures
                - Scans over all the candidate object locations as discussed in the
                  WHAT THIS OBJECT REPRESENTS section and stores all detections into
                  #dets.
                - for all valid i:
                    - #dets[i].second == The candidate object location which produced
                      this detection.  This rectangle gives the location of the
                      detection.
                    - #dets[i].first == The score for this detection.  This value is
                      equal to dot(w, feature vector for this candidate object
                      location).
                    - #dets[i].first >= thresh
                - #dets will be sorted in descending order.
                  (i.e. #dets[i].first >= #dets[j].first for all i, and j>i)
                - Elements of w beyond index get_num_dimensions()-1 are ignored.  I.e.
                  only the first get_num_dimensions() elements are used.
                - Note that no form of non-max suppression is performed.  If a location
                  has a score >= thresh then it is reported in #dets.
        !*/

        void get_feature_vector (
            const full_object_detection& obj,
            feature_vector_type& psi
        ) const;
        /*!
            requires
                - obj.num_parts() == 0
                - is_loaded_with_image() == true
                - psi.size() >= get_num_dimensions()
                  (i.e. psi must have preallocated its memory before this function is
                  called)
            ensures
                - This function allows you to determine the feature vector used for a
                  candidate object location output from detect().  Note that this
                  vector is added to psi.  Note also that you must use
                  get_full_object_detection() to convert a rectangle from detect() into
                  the needed full_object_detection.
                - The dimensionality of the vector added to psi is
                  get_num_dimensions().  This means that elements of psi after
                  psi(get_num_dimensions()-1) are not modified.
                - Since scan_image_boxes only searches a limited set of object
                  locations, not all possible rectangles can be output by detect().  So
                  in the case where obj.get_rect() could not arise from a call to
                  detect(), this function will map obj.get_rect() to the nearest
                  possible rectangle and then add the feature vector for the mapped
                  rectangle into #psi.
                - get_best_matching_rect(obj.get_rect()) == the rectangle
                  obj.get_rect() gets mapped to for feature extraction.
        !*/

        full_object_detection get_full_object_detection (
            const rectangle& rect,
            const feature_vector_type& w
        ) const;
        /*!
            ensures
                - returns full_object_detection(rect)
                  (This function is here only for compatibility with the
                  scan_image_pyramid object)
        !*/

        const rectangle get_best_matching_rect (const rectangle& rect) const;
        /*!
            requires
                - is_loaded_with_image() == true
            ensures
                - Since scan_image_boxes only searches a limited set of object
                  locations, not all possible rectangles can be represented.
                  Therefore, this function allows you to supply a rectangle and obtain
                  the nearest possible candidate object location rectangle.
        !*/

        unsigned long get_num_detection_templates () const
        {
            return 1;
        }
        /*!
            ensures
                - returns 1.  Note that this function is here only for compatibility
                  with the scan_image_pyramid object.  Notionally, its return value
                  indicates that a scan_image_boxes object is always ready to detect
                  objects once an image has been loaded.
        !*/

        unsigned long get_num_movable_components_per_detection_template () const
        {
            return 0;
        }
        /*!
            ensures
                - returns 0.  Note that this function is here only for compatibility
                  with the scan_image_pyramid object.  Its return value means that this
                  object does not support using movable part models.
        !*/

    };

// ----------------------------------------------------------------------------------------

    template <typename Feature_extractor_type, typename Box_generator>
    void serialize (
        const scan_image_boxes<Feature_extractor_type,Box_generator>& item,
        std::ostream& out
    );
    /*!
        provides serialization support
    !*/

    template <typename Feature_extractor_type, typename Box_generator>
    void deserialize (
        scan_image_boxes<Feature_extractor_type,Box_generator>& item,
        std::istream& in
    );
    /*!
        provides deserialization support
    !*/

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_SCAN_IMAGE_bOXES_ABSTRACT_Hh_