// Copyright (C) 2010 Davis E. King (davis@dlib.net) // License: Boost Software License See LICENSE.txt for the full license. #ifndef DLIB_RANDOM_SUBSeT_SELECTOR_H_ #define DLIB_RANDOM_SUBSeT_SELECTOR_H_ #include "random_subset_selector_abstract.h" #include "../rand.h" #include <vector> #include "../algs.h" #include "../string.h" #include "../serialize.h" #include "../matrix/matrix_mat.h" #include <iostream> namespace dlib { template < typename T, typename Rand_type = dlib::rand > class random_subset_selector { /*! INITIAL VALUE - _max_size == 0 - items.size() == 0 - count == 0 - _next_add_accepts == false CONVENTION - count == the number of times add() has been called since the last time this object was empty. - items.size() == size() - max_size() == _max_size - next_add_accepts() == _next_add_accepts !*/ public: typedef T type; typedef T value_type; typedef default_memory_manager mem_manager_type; typedef Rand_type rand_type; typedef typename std::vector<T>::iterator iterator; typedef typename std::vector<T>::const_iterator const_iterator; random_subset_selector ( ) { _max_size = 0; make_empty(); } void set_seed(const std::string& value) { rnd.set_seed(value); } void make_empty ( ) { items.resize(0); count = 0; update_next_add_accepts(); } const std::vector<T>& to_std_vector( ) const { return items; } unsigned long size ( ) const { return items.size(); } void set_max_size ( unsigned long new_max_size ) { items.reserve(new_max_size); make_empty(); _max_size = new_max_size; update_next_add_accepts(); } unsigned long max_size ( ) const { return _max_size; } T& operator[] ( unsigned long idx ) { // make sure requires clause is not broken DLIB_ASSERT(idx < size(), "\tvoid random_subset_selector::operator[]()" << "\n\t idx is out of range" << "\n\t idx: " << idx << "\n\t size(): " << size() << "\n\t this: " << this ); return items[idx]; } const T& operator[] ( unsigned long idx ) const { // make sure requires clause is not broken DLIB_ASSERT(idx < size(), "\tvoid random_subset_selector::operator[]()" << "\n\t idx is out of range" << "\n\t idx: " << idx << "\n\t size(): " << size() << "\n\t this: " << this ); return items[idx]; } iterator begin() { return items.begin(); } const_iterator begin() const { return items.begin(); } iterator end() { return items.end(); } const_iterator end() const { return items.end(); } bool next_add_accepts ( ) const { return _next_add_accepts; } void add ( const T& new_item ) { if (items.size() < _max_size) { items.push_back(new_item); // swap into a random place exchange(items[rnd.get_random_32bit_number()%items.size()], items.back()); } else if (_next_add_accepts) { // pick a random element of items and replace it. items[rnd.get_random_32bit_number()%items.size()] = new_item; } update_next_add_accepts(); ++count; } void add ( ) { // make sure requires clause is not broken DLIB_ASSERT(next_add_accepts() == false, "\tvoid random_subset_selector::add()" << "\n\t You should be calling the version of add() that takes an argument" << "\n\t this: " << this ); update_next_add_accepts(); ++count; } void swap ( random_subset_selector& a ) { items.swap(a.items); std::swap(_max_size, a._max_size); std::swap(count, a.count); rnd.swap(a.rnd); std::swap(_next_add_accepts, a._next_add_accepts); } template <typename T1, typename T2> friend void serialize ( const random_subset_selector<T1,T2>& item, std::ostream& out ); template <typename T1, typename T2> friend void deserialize ( random_subset_selector<T1,T2>& item, std::istream& in ); private: void update_next_add_accepts ( ) { if (items.size() < _max_size) { _next_add_accepts = true; } else if (_max_size == 0) { _next_add_accepts = false; } else { // At this point each element of items has had an equal chance of being in this object. // In particular, the probability that each arrived here is currently items.size()/count. // We need to be able to say that, after this function ends, the probability of any // particular object ending up in items is items.size()/(count+1). So this means that // we should decide to add a new item into items with this probability. Also, if we do // so then we pick one of the current items and replace it at random with the new item. // Make me a random 64 bit number. This might seem excessive but I want this object // to be able to handle an effectively infinite number of calls to add(). So count // might get very large and we need to deal with that properly. const unsigned long num1 = rnd.get_random_32bit_number(); const unsigned long num2 = rnd.get_random_32bit_number(); uint64 num = num1; num <<= 32; num |= num2; num %= (count+1); _next_add_accepts = (num < items.size()); } } std::vector<T> items; unsigned long _max_size; uint64 count; rand_type rnd; bool _next_add_accepts; }; template < typename T, typename rand_type > void swap ( random_subset_selector<T,rand_type>& a, random_subset_selector<T,rand_type>& b ) { a.swap(b); } // ---------------------------------------------------------------------------------------- template <typename T1, typename T2> void serialize ( const random_subset_selector<T1,T2>& item, std::ostream& out ) { serialize(item.items, out); serialize(item._max_size, out); serialize(item.count, out); serialize(item.rnd, out); serialize(item._next_add_accepts, out); } template <typename T1, typename T2> void deserialize ( random_subset_selector<T1,T2>& item, std::istream& in ) { deserialize(item.items, in); deserialize(item._max_size, in); deserialize(item.count, in); deserialize(item.rnd, in); deserialize(item._next_add_accepts, in); } // ---------------------------------------------------------------------------------------- template < typename T, typename alloc > random_subset_selector<T> randomly_subsample ( const std::vector<T,alloc>& samples, unsigned long num ) { random_subset_selector<T> subset; subset.set_max_size(num); for (unsigned long i = 0; i < samples.size(); ++i) subset.add(samples[i]); return subset; } // ---------------------------------------------------------------------------------------- template < typename T, typename alloc, typename U > random_subset_selector<T> randomly_subsample ( const std::vector<T,alloc>& samples, unsigned long num, const U& random_seed ) { random_subset_selector<T> subset; subset.set_seed(cast_to_string(random_seed)); subset.set_max_size(num); for (unsigned long i = 0; i < samples.size(); ++i) subset.add(samples[i]); return subset; } // ---------------------------------------------------------------------------------------- template < typename T > random_subset_selector<T> randomly_subsample ( const random_subset_selector<T>& samples, unsigned long num ) { random_subset_selector<T> subset; subset.set_max_size(num); for (unsigned long i = 0; i < samples.size(); ++i) subset.add(samples[i]); return subset; } // ---------------------------------------------------------------------------------------- template < typename T, typename U > random_subset_selector<T> randomly_subsample ( const random_subset_selector<T>& samples, unsigned long num, const U& random_seed ) { random_subset_selector<T> subset; subset.set_seed(cast_to_string(random_seed)); subset.set_max_size(num); for (unsigned long i = 0; i < samples.size(); ++i) subset.add(samples[i]); return subset; } // ---------------------------------------------------------------------------------------- template < typename T > const matrix_op<op_array_to_mat<random_subset_selector<T> > > mat ( const random_subset_selector<T>& m ) { typedef op_array_to_mat<random_subset_selector<T> > op; return matrix_op<op>(op(m)); } // ---------------------------------------------------------------------------------------- } #endif // DLIB_RANDOM_SUBSeT_SELECTOR_H_