// Copyright (C) 2013 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. #ifndef DLIB_SCAN_IMAGE_bOXES_Hh_ #define DLIB_SCAN_IMAGE_bOXES_Hh_ #include "scan_image_boxes_abstract.h" #include "../matrix.h" #include "../geometry.h" #include "../array2d.h" #include <vector> #include "../image_processing/full_object_detection.h" #include "../image_transforms.h" namespace dlib { // ---------------------------------------------------------------------------------------- class default_box_generator { public: template <typename image_type> void operator() ( const image_type& img, std::vector<rectangle>& rects ) const { rects.clear(); find_candidate_object_locations(img, rects); } }; inline void serialize(const default_box_generator&, std::ostream& ) {} inline void deserialize(default_box_generator&, std::istream& ) {} // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator = default_box_generator > class scan_image_boxes : noncopyable { public: typedef matrix<double,0,1> feature_vector_type; typedef Feature_extractor_type feature_extractor_type; typedef Box_generator box_generator; scan_image_boxes ( ); template < typename image_type > void load ( const image_type& img ); inline bool is_loaded_with_image ( ) const; inline void copy_configuration( const feature_extractor_type& fe ); inline void copy_configuration( const box_generator& bg ); const box_generator& get_box_generator ( ) const { return detect_boxes; } const Feature_extractor_type& get_feature_extractor ( ) const { return feats; } inline void copy_configuration ( const scan_image_boxes& item ); inline long get_num_dimensions ( ) const; unsigned long get_num_spatial_pyramid_levels ( ) const; void set_num_spatial_pyramid_levels ( unsigned long levels ); void detect ( const feature_vector_type& w, std::vector<std::pair<double, rectangle> >& dets, const double thresh ) const; void get_feature_vector ( const full_object_detection& obj, feature_vector_type& psi ) const; full_object_detection get_full_object_detection ( const rectangle& rect, const feature_vector_type& w ) const; const rectangle get_best_matching_rect ( const rectangle& rect ) const; /*! requires - is_loaded_with_image() == true !*/ inline unsigned long get_num_detection_templates ( ) const { return 1; } inline unsigned long get_num_movable_components_per_detection_template ( ) const { return 0; } template <typename T, typename U> friend void serialize ( const scan_image_boxes<T,U>& item, std::ostream& out ); template <typename T, typename U> friend void deserialize ( scan_image_boxes<T,U>& item, std::istream& in ); private: static bool compare_pair_rect ( const std::pair<double, rectangle>& a, const std::pair<double, rectangle>& b ) { return a.first < b.first; } void test_coordinate_transforms() { for (long x = -10; x <= 10; x += 10) { for (long y = -10; y <= 10; y += 10) { const rectangle rect = centered_rect(x,y,5,6); rectangle a; a = feats.image_to_feat_space(rect); if (a.width() > 10000000 || a.height() > 10000000 ) { DLIB_CASSERT(false, "The image_to_feat_space() routine is outputting rectangles of an implausibly " << "\nlarge size. This means there is probably a bug in your feature extractor."); } a = feats.feat_to_image_space(rect); if (a.width() > 10000000 || a.height() > 10000000 ) { DLIB_CASSERT(false, "The feat_to_image_space() routine is outputting rectangles of an implausibly " << "\nlarge size. This means there is probably a bug in your feature extractor."); } } } } static void add_grid_rects ( std::vector<rectangle>& rects, const rectangle& object_box, unsigned int cells_x, unsigned int cells_y ) { // make sure requires clause is not broken DLIB_ASSERT(cells_x > 0 && cells_y > 0, "\t void add_grid_rects()" << "\n\t The number of cells along a dimension can't be zero. " << "\n\t cells_x: " << cells_x << "\n\t cells_y: " << cells_y ); const matrix_range_exp<double>& x = linspace(object_box.left(), object_box.right(), cells_x+1); const matrix_range_exp<double>& y = linspace(object_box.top(), object_box.bottom(), cells_y+1); for (long j = 0; j+1 < y.size(); ++j) { for (long i = 0; i+1 < x.size(); ++i) { const dlib::vector<double,2> tl(x(i),y(j)); const dlib::vector<double,2> br(x(i+1),y(j+1)); rects.push_back(rectangle(tl,br)); } } } void get_feature_extraction_regions ( const rectangle& rect, std::vector<rectangle>& regions ) const /*! ensures - #regions.size() is always the same number no matter what the input is. The regions also have a consistent ordering. - all the output rectangles are contained within rect. !*/ { regions.clear(); for (unsigned int l = 1; l <= num_spatial_pyramid_levels; ++l) { const int cells = (int)std::pow(2.0, l-1.0); add_grid_rects(regions, rect, cells, cells); } } unsigned int get_num_components_per_detection_template( ) const { return (unsigned int)(std::pow(4.0,(double)num_spatial_pyramid_levels)-1)/3; } feature_extractor_type feats; std::vector<rectangle> search_rects; bool loaded_with_image; unsigned int num_spatial_pyramid_levels; box_generator detect_boxes; const long box_sizedims; const long box_maxsize; }; // ---------------------------------------------------------------------------------------- template <typename T, typename U> void serialize ( const scan_image_boxes<T,U>& item, std::ostream& out ) { int version = 1; serialize(version, out); serialize(item.feats, out); serialize(item.search_rects, out); serialize(item.loaded_with_image, out); serialize(item.num_spatial_pyramid_levels, out); serialize(item.detect_boxes, out); serialize(item.get_num_dimensions(), out); } // ---------------------------------------------------------------------------------------- template <typename T, typename U> void deserialize ( scan_image_boxes<T,U>& item, std::istream& in ) { int version = 0; deserialize(version, in); if (version != 1) throw serialization_error("Unsupported version found when deserializing a scan_image_boxes object."); deserialize(item.feats, in); deserialize(item.search_rects, in); deserialize(item.loaded_with_image, in); deserialize(item.num_spatial_pyramid_levels, in); deserialize(item.detect_boxes, in); // When developing some feature extractor, it's easy to accidentally change its // number of dimensions and then try to deserialize data from an older version of // your extractor into the current code. This check is here to catch that kind of // user error. long dims; deserialize(dims, in); if (item.get_num_dimensions() != dims) throw serialization_error("Number of dimensions in serialized scan_image_boxes doesn't match the expected number."); } // ---------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------- // scan_image_boxes member functions // ---------------------------------------------------------------------------------------- // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > scan_image_boxes<Feature_extractor_type,Box_generator>:: scan_image_boxes ( ) : loaded_with_image(false), num_spatial_pyramid_levels(3), box_sizedims(20), box_maxsize(1200) { } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > template < typename image_type > void scan_image_boxes<Feature_extractor_type,Box_generator>:: load ( const image_type& img ) { feats.load(img); detect_boxes(img, search_rects); loaded_with_image = true; } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > bool scan_image_boxes<Feature_extractor_type,Box_generator>:: is_loaded_with_image ( ) const { return loaded_with_image; } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > void scan_image_boxes<Feature_extractor_type,Box_generator>:: copy_configuration( const feature_extractor_type& fe ) { test_coordinate_transforms(); feats.copy_configuration(fe); } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > void scan_image_boxes<Feature_extractor_type,Box_generator>:: copy_configuration( const box_generator& bg ) { detect_boxes = bg; } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > void scan_image_boxes<Feature_extractor_type,Box_generator>:: copy_configuration ( const scan_image_boxes& item ) { feats.copy_configuration(item.feats); detect_boxes = item.detect_boxes; num_spatial_pyramid_levels = item.num_spatial_pyramid_levels; } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > unsigned long scan_image_boxes<Feature_extractor_type,Box_generator>:: get_num_spatial_pyramid_levels ( ) const { return num_spatial_pyramid_levels; } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > void scan_image_boxes<Feature_extractor_type,Box_generator>:: set_num_spatial_pyramid_levels ( unsigned long levels ) { // make sure requires clause is not broken DLIB_ASSERT(levels > 0, "\t void scan_image_boxes::set_num_spatial_pyramid_levels()" << "\n\t Invalid inputs were given to this function " << "\n\t levels: " << levels << "\n\t this: " << this ); num_spatial_pyramid_levels = levels; } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > long scan_image_boxes<Feature_extractor_type,Box_generator>:: get_num_dimensions ( ) const { return feats.get_num_dimensions()*get_num_components_per_detection_template() + box_sizedims*2; } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > void scan_image_boxes<Feature_extractor_type,Box_generator>:: detect ( const feature_vector_type& w, std::vector<std::pair<double, rectangle> >& dets, const double thresh ) const { // make sure requires clause is not broken DLIB_ASSERT(is_loaded_with_image() && w.size() >= get_num_dimensions(), "\t void scan_image_boxes::detect()" << "\n\t Invalid inputs were given to this function " << "\n\t is_loaded_with_image(): " << is_loaded_with_image() << "\n\t w.size(): " << w.size() << "\n\t get_num_dimensions(): " << get_num_dimensions() << "\n\t this: " << this ); dets.clear(); array<integral_image_generic<double> > saliency_images(get_num_components_per_detection_template()); array2d<double> temp_img(feats.nr(), feats.nc()); // build saliency images for (unsigned long i = 0; i < saliency_images.size(); ++i) { const unsigned long offset = 2*box_sizedims + feats.get_num_dimensions()*i; // make the basic saliency image for the i-th feature extraction region for (long r = 0; r < feats.nr(); ++r) { for (long c = 0; c < feats.nc(); ++c) { const typename feature_extractor_type::descriptor_type& descriptor = feats(r,c); double sum = 0; for (unsigned long k = 0; k < descriptor.size(); ++k) { sum += w(descriptor[k].first + offset)*descriptor[k].second; } temp_img[r][c] = sum; } } // now convert base saliency image into final integral image saliency_images[i].load(temp_img); } // now search the saliency images std::vector<rectangle> regions; const rectangle bounds = get_rect(feats); for (unsigned long i = 0; i < search_rects.size(); ++i) { const rectangle rect = feats.image_to_feat_space(search_rects[i]).intersect(bounds); if (rect.is_empty()) continue; get_feature_extraction_regions(rect, regions); double score = 0; for (unsigned long k = 0; k < regions.size(); ++k) { score += saliency_images[k].get_sum_of_area(regions[k]); } const double width = search_rects[i].width(); const double height = search_rects[i].height(); score += dot(linpiece(width, linspace(0, box_maxsize, box_sizedims+1)), rowm(w, range(0,box_sizedims-1))); score += dot(linpiece(height, linspace(0, box_maxsize, box_sizedims+1)), rowm(w, range(box_sizedims,2*box_sizedims-1))); if (score >= thresh) { dets.push_back(std::make_pair(score, search_rects[i])); } } std::sort(dets.rbegin(), dets.rend(), compare_pair_rect); } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > const rectangle scan_image_boxes<Feature_extractor_type,Box_generator>:: get_best_matching_rect ( const rectangle& rect ) const { // make sure requires clause is not broken DLIB_ASSERT(is_loaded_with_image(), "\t const rectangle scan_image_boxes::get_best_matching_rect()" << "\n\t Invalid inputs were given to this function " << "\n\t is_loaded_with_image(): " << is_loaded_with_image() << "\n\t this: " << this ); double best_score = -1; rectangle best_rect; for (unsigned long i = 0; i < search_rects.size(); ++i) { const double score = (rect.intersect(search_rects[i])).area()/(double)(rect+search_rects[i]).area(); if (score > best_score) { best_score = score; best_rect = search_rects[i]; } } return best_rect; } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > full_object_detection scan_image_boxes<Feature_extractor_type,Box_generator>:: get_full_object_detection ( const rectangle& rect, const feature_vector_type& /*w*/ ) const { return full_object_detection(rect); } // ---------------------------------------------------------------------------------------- template < typename Feature_extractor_type, typename Box_generator > void scan_image_boxes<Feature_extractor_type,Box_generator>:: get_feature_vector ( const full_object_detection& obj, feature_vector_type& psi ) const { // make sure requires clause is not broken DLIB_ASSERT(is_loaded_with_image() && psi.size() >= get_num_dimensions() && obj.num_parts() == 0, "\t void scan_image_boxes::get_feature_vector()" << "\n\t Invalid inputs were given to this function " << "\n\t is_loaded_with_image(): " << is_loaded_with_image() << "\n\t psi.size(): " << psi.size() << "\n\t get_num_dimensions(): " << get_num_dimensions() << "\n\t obj.num_parts(): " << obj.num_parts() << "\n\t this: " << this ); const rectangle best_rect = get_best_matching_rect(obj.get_rect()); const rectangle mapped_rect = feats.image_to_feat_space(best_rect).intersect(get_rect(feats)); if (mapped_rect.is_empty()) return; std::vector<rectangle> regions; get_feature_extraction_regions(mapped_rect, regions); // pull features out of all the boxes in regions. for (unsigned long j = 0; j < regions.size(); ++j) { const rectangle rect = regions[j]; const unsigned long template_region_id = j; const unsigned long offset = box_sizedims*2 + feats.get_num_dimensions()*template_region_id; for (long r = rect.top(); r <= rect.bottom(); ++r) { for (long c = rect.left(); c <= rect.right(); ++c) { const typename feature_extractor_type::descriptor_type& descriptor = feats(r,c); for (unsigned long k = 0; k < descriptor.size(); ++k) { psi(descriptor[k].first + offset) += descriptor[k].second; } } } } const double width = best_rect.width(); const double height = best_rect.height(); set_rowm(psi, range(0,box_sizedims-1)) += linpiece(width, linspace(0, box_maxsize, box_sizedims+1)); set_rowm(psi, range(box_sizedims,box_sizedims*2-1)) += linpiece(height, linspace(0, box_maxsize, box_sizedims+1)); } // ---------------------------------------------------------------------------------------- } #endif // DLIB_SCAN_IMAGE_bOXES_Hh_