// dlib C++ Library - scan_image_boxes

// Copyright (C) 2013  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#undef DLIB_SCAN_IMAGE_bOXES_ABSTRACT_Hh_
#ifdef DLIB_SCAN_IMAGE_bOXES_ABSTRACT_Hh_

#include "../matrix.h"
#include "../geometry.h"
#include "../image_processing.h"
#include "../array2d.h"
#include "full_object_detection_abstract.h"
#include "../image_transforms/segment_image_abstract.h"
#include <vector>

namespace dlib
{

// ----------------------------------------------------------------------------------------

    class default_box_generator
    {
        /*!
            WHAT THIS OBJECT REPRESENTS
                This is a function object that takes in an image and outputs a set of
                candidate object locations.  It is also the default box generator used by
                the scan_image_boxes object defined below.

                The candidate locations are produced by calling
                find_candidate_object_locations(img, rects), i.e. without passing any
                arguments beyond the image and the output vector.
        !*/

    public:

        template <typename image_type>
        void operator() (
            const image_type& img,
            std::vector<rectangle>& rects
        ) const
        /*!
            requires
                - image_type must be a type that can be passed as the first argument of
                  find_candidate_object_locations().
            ensures
                - #rects == the set of candidate object locations which should be searched
                  inside img.  That is, these are the rectangles which might contain
                  objects of interest within the given image.
                - Whatever rects contained before the call is discarded.
        !*/
        {
            // Start from an empty list so nothing previously stored in rects can carry
            // over into the output.
            rects.clear();
            find_candidate_object_locations(img, rects);
        }
    };

    inline void serialize  (const default_box_generator&, std::ostream& ) {}
    inline void deserialize(      default_box_generator&, std::istream& ) {}
    /*!
        ensures
            - provides serialization support for default_box_generator.
            - Both functions have empty bodies, so nothing is written to or read from
              the given stream.
    !*/

// ----------------------------------------------------------------------------------------

    template <
        typename Feature_extractor_type,
        typename Box_generator = default_box_generator
        >
    class scan_image_boxes : noncopyable
    {
        /*!
            REQUIREMENTS ON Feature_extractor_type
                - must be an object with an interface compatible with the hashed_feature_image 
                  object defined in dlib/image_keypoint/hashed_feature_image_abstract.h or 
                  with the nearest_neighbor_feature_image object defined in 
                  dlib/image_keypoint/nearest_neighbor_feature_image_abstract.h

            REQUIREMENTS ON Box_generator
                - must be an object with an interface compatible with the
                  default_box_generator object defined at the top of this file.

            INITIAL VALUE
                - get_num_spatial_pyramid_levels() == 3
                - is_loaded_with_image() == false

            WHAT THIS OBJECT REPRESENTS
                This object is a tool for running a classifier over an image with the goal
                of localizing each object present.  The localization is in the form of the
                bounding box around each object of interest.  

                Unlike the scan_image_pyramid object which scans a fixed sized window over
                an image pyramid, the scan_image_boxes tool allows you to define your own
                list of "candidate object locations" which should be evaluated.  This is
                simply a list of rectangle objects which might contain objects of interest.
                The scan_image_boxes object will then evaluate the classifier at each of
                these locations and return the subset of rectangles which appear to have
                objects in them.  The candidate object location generation is provided by
                the Box_generator that is passed in as a template argument.  

                This object can also be understood as a general tool for implementing the
                spatial pyramid models described in the paper:
                    Beyond Bags of Features: Spatial Pyramid Matching for Recognizing 
                    Natural Scene Categories by Svetlana Lazebnik, Cordelia Schmid, 
                    and Jean Ponce


                The classifiers used by this object have three parts: 
                   1. The underlying feature extraction provided by Feature_extractor_type
                      objects, which associate a vector with each location in an image.

                   2. A rule for extracting a feature vector from a candidate object
                      location.  In this object we use the spatial pyramid matching method.
                      This means we cut an object's detection window into a set of "feature
                      extraction regions" and extract a bag-of-words vector from each
                      before finally concatenating them to form the final feature vector
                      representing the entire object window.  The set of feature extraction
                      regions can be configured by the user by calling
                      set_num_spatial_pyramid_levels().  To be a little more precise, the
                      feature vector for a candidate object window is defined as follows:
                        - Let N denote the number of feature extraction zones (a "zone"
                          here is the same thing as a "feature extraction region" above).
                        - Let M denote the dimensionality of the vectors output by
                          Feature_extractor_type objects.
                        - Let F(i) == the M dimensional vector which is the sum of all
                          vectors given by our Feature_extractor_type object inside the
                          i-th feature extraction zone.  So this is notionally a
                          bag-of-words vector from the i-th zone.
                        - Then the feature vector for an object window is an M*N
                          dimensional vector [F(1) F(2) F(3) ... F(N)] (i.e. it is a
                          concatenation of the N vectors).  This feature vector can be
                          thought of as a collection of N bags-of-words, each bag coming
                          from a spatial location determined by one of the feature
                          extraction zones.
                          
                   3. A weight vector and a threshold value.  The dot product between the
                      weight vector and the feature vector for a candidate object location
                      gives the score of the location.  If this score is greater than or
                      equal to the threshold value then the candidate object location is
                      output as a detection (see detect()).

            THREAD SAFETY
                Concurrent access to an instance of this object is not safe and should be
                protected by a mutex lock except for the case where you are copying the
                configuration (via copy_configuration()) of a scan_image_boxes object to
                many other threads.  In this case, it is safe to copy the configuration of
                a shared object so long as no other operations are performed on it.
        !*/

    public:

        typedef matrix<double,0,1> feature_vector_type;

        typedef Feature_extractor_type feature_extractor_type;
        typedef Box_generator box_generator;

        scan_image_boxes (
        );  
        /*!
            ensures
                - this object is properly initialized
        !*/

        template <
            typename image_type
            >
        void load (
            const image_type& img
        );
        /*!
            requires
                - image_type must be a type with the following properties:
                    - image_type objects can be loaded into Feature_extractor_type
                      objects via Feature_extractor_type::load().
                    - image_type objects can be passed to the first argument of
                      Box_generator::operator()
            ensures
                - #is_loaded_with_image() == true
                - This object is ready to run a classifier over img to detect object
                  locations.  Call detect() to do this.
        !*/

        bool is_loaded_with_image (
        ) const;
        /*!
            ensures
                - returns true if this object has been loaded with an image to process and
                  false otherwise.
        !*/

        const feature_extractor_type& get_feature_extractor (
        ) const;
        /*!
            ensures
                - returns a const reference to the feature_extractor_type object used 
                  internally for local feature extraction.  
        !*/

        void copy_configuration(
            const feature_extractor_type& fe
        );
        /*!
            ensures
                - This function performs the equivalent of
                  get_feature_extractor().copy_configuration(fe) (i.e. this function allows
                  you to configure the parameters of the underlying feature extractor used
                  by a scan_image_boxes object)
        !*/

        void copy_configuration(
            const box_generator& bg
        );
        /*!
            ensures
                - #get_box_generator() == bg
                  (i.e. this function allows you to configure the parameters of the
                  underlying box generator used by a scan_image_boxes object)
        !*/

        const box_generator& get_box_generator (
        ) const;
        /*!
            ensures
                - returns the box_generator used by this object to generate candidate
                  object locations.
        !*/

        void copy_configuration (
            const scan_image_boxes& item
        );
        /*!
            ensures
                - Copies all the state information of item into *this, except for state 
                  information populated by load().  More precisely, given two scan_image_boxes 
                  objects S1 and S2, the following sequence of instructions should always 
                  result in both of them having the exact same state:
                    S2.copy_configuration(S1);
                    S1.load(img);
                    S2.load(img);
        !*/

        long get_num_dimensions (
        ) const;
        /*!
            ensures
                - returns the number of dimensions in the feature vector for a candidate
                  object location.  This value is the dimensionality of the underlying
                  feature vectors produced by Feature_extractor_type times the number of
                  feature extraction regions used.  Note that the number of feature
                  extraction regions used is a function of
                  get_num_spatial_pyramid_levels().  For example, when
                  get_num_spatial_pyramid_levels() == 3 there are 1+4+16 == 21 regions,
                  so the returned value is 21 times the dimensionality of the underlying
                  feature vectors.
        !*/

        unsigned long get_num_spatial_pyramid_levels (
        ) const;
        /*!
            ensures
                - returns the number of layers in the spatial pyramid.  For example, if
                  this function returns 1 then it means we use a simple bag-of-words
                  representation over the whole object window.  If it returns 2 then it
                  means the feature representation is the concatenation of 5 bag-of-words
                  vectors, one from the entire object window and 4 others from 4 different
                  parts of the object window.  If it returns 3 then there are 1+4+16
                  bag-of-words vectors concatenated together in the feature representation,
                  and so on.
        !*/

        void set_num_spatial_pyramid_levels (
            unsigned long levels
        );
        /*!
            requires
                - levels > 0
            ensures
                - #get_num_spatial_pyramid_levels() == levels
        !*/

        void detect (
            const feature_vector_type& w,
            std::vector<std::pair<double, rectangle> >& dets,
            const double thresh
        ) const;
        /*!
            requires
                - w.size() >= get_num_dimensions()
                - is_loaded_with_image() == true
            ensures
                - Scans over all the candidate object locations as discussed in the WHAT
                  THIS OBJECT REPRESENTS section and stores all detections into #dets.
                - for all valid i:
                    - #dets[i].second == The candidate object location which produced this
                      detection.  This rectangle gives the location of the detection.  
                    - #dets[i].first == The score for this detection.  This value is equal
                      to dot(w, feature vector for this candidate object location).
                    - #dets[i].first >= thresh
                - #dets will be sorted in descending order of score. 
                  (i.e.  #dets[i].first >= #dets[j].first for all i, and j>i)
                - Elements of w beyond index get_num_dimensions()-1 are ignored.  I.e. only
                  the first get_num_dimensions() are used.
                - Note that no form of non-max suppression is performed.  If a location
                  has a score >= thresh then it is reported in #dets.
        !*/

        void get_feature_vector (
            const full_object_detection& obj,
            feature_vector_type& psi
        ) const;
        /*!
            requires
                - obj.num_parts() == 0 
                - is_loaded_with_image() == true
                - psi.size() >= get_num_dimensions()
                  (i.e. psi must have preallocated its memory before this function is called)
            ensures
                - This function allows you to determine the feature vector used for a
                  candidate object location output from detect().  Note that this vector is
                  added to psi.  Note also that you must use get_full_object_detection() to
                  convert a rectangle from detect() into the needed full_object_detection.
                - The dimensionality of the vector added to psi is get_num_dimensions().  This
                  means that elements of psi after psi(get_num_dimensions()-1) are not modified.
                - Since scan_image_boxes only searches a limited set of object locations,
                  not all possible rectangles can be output by detect().  So in the case
                  where obj.get_rect() could not arise from a call to detect(), this
                  function will map obj.get_rect() to the nearest possible rectangle and
                  then add the feature vector for the mapped rectangle into #psi.
                - get_best_matching_rect(obj.get_rect()) == the rectangle obj.get_rect()
                  gets mapped to for feature extraction.
        !*/

        full_object_detection get_full_object_detection (
            const rectangle& rect,
            const feature_vector_type& w
        ) const;
        /*!
            ensures
                - returns full_object_detection(rect)
                  (This function is here only for compatibility with the scan_image_pyramid
                  object)
                - The returned value is determined by rect alone; w does not influence it.
        !*/

        const rectangle get_best_matching_rect (
            const rectangle& rect
        ) const;
        /*!
            requires
                - is_loaded_with_image() == true
            ensures
                - Since scan_image_boxes only searches a limited set of object locations,
                  not all possible rectangles can be represented.  Therefore, this function
                  allows you to supply a rectangle and obtain the nearest possible
                  candidate object location rectangle.
        !*/

        unsigned long get_num_detection_templates (
        ) const { return 1; }
        /*!
            ensures
                - returns 1.  Note that this function is here only for compatibility with 
                  the scan_image_pyramid object.  Notionally, its return value indicates 
                  that a scan_image_boxes object is always ready to detect objects once
                  an image has been loaded.
        !*/

        unsigned long get_num_movable_components_per_detection_template (
        ) const { return 0; }
        /*!
            ensures
                - returns 0.  Note that this function is here only for compatibility with
                  the scan_image_pyramid object.  Its return value means that this object
                  does not support using movable part models.
        !*/
    };

// ----------------------------------------------------------------------------------------

    template <
        typename Feature_extractor_type,
        typename Box_generator 
        >
    void serialize (
        const scan_image_boxes<Feature_extractor_type,Box_generator>& item,
        std::ostream& out
    );
    /*!
        ensures
            - provides serialization support for scan_image_boxes objects: item is
              written to the out stream.
    !*/

    template <
        typename Feature_extractor_type,
        typename Box_generator 
        >
    void deserialize (
        scan_image_boxes<Feature_extractor_type,Box_generator>& item,
        std::istream& in 
    );
    /*!
        ensures
            - provides deserialization support for scan_image_boxes objects: item is
              read back from the in stream.
    !*/

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_SCAN_IMAGE_bOXES_ABSTRACT_Hh_