// Copyright (C) 2012  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_SVm_C_LINEAR_DCD_TRAINER_Hh_
#define DLIB_SVm_C_LINEAR_DCD_TRAINER_Hh_

#include "svm_c_linear_dcd_trainer_abstract.h"
#include <cmath>
#include <limits>
#include "../matrix.h"
#include "../algs.h"
#include "../rand.h"
#include "function.h"
#include "kernel.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    template <
        typename K
        >
    class svm_c_linear_dcd_trainer
    {
    public:
        typedef K kernel_type;
        typedef typename kernel_type::scalar_type scalar_type;
        typedef typename kernel_type::sample_type sample_type;
        typedef typename kernel_type::mem_manager_type mem_manager_type;
        typedef decision_function<kernel_type> trained_function_type;
        typedef typename decision_function<K>::sample_vector_type sample_vector_type;
        typedef typename decision_function<K>::scalar_vector_type scalar_vector_type;

        // You are getting a compiler error on this line because you supplied a non-linear
        // kernel to the svm_c_linear_dcd_trainer object.  You have to use one of the
        // linear kernels with this trainer.
        COMPILE_TIME_ASSERT((is_same_type<K, linear_kernel<sample_type> >::value ||
                             is_same_type<K, sparse_linear_kernel<sample_type> >::value ));

        svm_c_linear_dcd_trainer (
        ) :
            Cpos(1),
            Cneg(1),
            eps(0.1),
            max_iterations(10000),
            verbose(false),
            have_bias(true),
            last_weight_1(false),
            do_shrinking(true),
            do_svm_l2(false)
        {
        }

        explicit svm_c_linear_dcd_trainer (
            const scalar_type& C_
        ) :
            Cpos(C_),
            Cneg(C_),
            eps(0.1),
            max_iterations(10000),
            verbose(false),
            have_bias(true),
            last_weight_1(false),
            do_shrinking(true),
            do_svm_l2(false)
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(0 < C_,
                "\tsvm_c_linear_dcd_trainer::svm_c_linear_dcd_trainer(C_)"
                << "\n\t invalid inputs were given to this function"
                << "\n\t C_: " << C_
                );
        }

        bool includes_bias (
        ) const
        {
            return have_bias;
        }

        void include_bias (
            bool should_have_bias
        )
        {
            have_bias = should_have_bias;
        }

        bool forces_last_weight_to_1 (
        ) const
        {
            return last_weight_1;
        }

        void force_last_weight_to_1 (
            bool should_last_weight_be_1
        )
        {
            last_weight_1 = should_last_weight_be_1;
        }

        bool shrinking_enabled (
        ) const
        {
            return do_shrinking;
        }

        void enable_shrinking (
            bool enabled
        )
        {
            do_shrinking = enabled;
        }

        bool solving_svm_l2_problem (
        ) const
        {
            return do_svm_l2;
        }

        void solve_svm_l2_problem (
            bool enabled
        )
        {
            do_svm_l2 = enabled;
        }

        void be_verbose (
        )
        {
            verbose = true;
        }

        void be_quiet (
        )
        {
            verbose = false;
        }

        void set_epsilon (
            scalar_type eps_
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(eps_ > 0,
                "\tvoid svm_c_linear_dcd_trainer::set_epsilon(eps_)"
                << "\n\t invalid inputs were given to this function"
                << "\n\t eps_: " << eps_
                );

            eps = eps_;
        }

        const scalar_type get_epsilon (
        ) const
        {
            return eps;
        }

        // Return the kernel by value.  The kernel is stateless and constructed on the
        // fly, so returning a reference here would dangle.
        const kernel_type get_kernel (
        ) const
        {
            return kernel_type();
        }

        unsigned long get_max_iterations (
        ) const
        {
            return max_iterations;
        }

        void set_max_iterations (
            unsigned long max_iter
        )
        {
            max_iterations = max_iter;
        }

        void set_c (
            scalar_type C
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(C > 0,
                "\t void svm_c_linear_dcd_trainer::set_c()"
                << "\n\t C must be greater than 0"
                << "\n\t C: " << C
                << "\n\t this: " << this
                );

            Cpos = C;
            Cneg = C;
        }

        const scalar_type get_c_class1 (
        ) const
        {
            return Cpos;
        }

        const scalar_type get_c_class2 (
        ) const
        {
            return Cneg;
        }

        void set_c_class1 (
            scalar_type C
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(C > 0,
                "\t void svm_c_linear_dcd_trainer::set_c_class1()"
                << "\n\t C must be greater than 0"
                << "\n\t C: " << C
                << "\n\t this: " << this
                );

            Cpos = C;
        }

        void set_c_class2 (
            scalar_type C
        )
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(C > 0,
                "\t void svm_c_linear_dcd_trainer::set_c_class2()"
                << "\n\t C must be greater than 0"
                << "\n\t C: " << C
                << "\n\t this: " << this
                );

            Cneg = C;
        }
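        /*
            A minimal usage sketch (illustrative only; see the _abstract.h file for
            the formal contract).  The container and variable names below are
            assumptions for illustration, not part of this header:

                typedef matrix<double,0,1> sample_type;
                typedef linear_kernel<sample_type> kernel_type;

                std::vector<sample_type> samples;
                std::vector<double> labels;   // must contain only +1 and -1 values
                // ... fill samples and labels ...

                svm_c_linear_dcd_trainer<kernel_type> trainer;
                trainer.set_c(10);
                decision_function<kernel_type> df = trainer.train(samples, labels);
        */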
        class optimizer_state
        {
            friend class svm_c_linear_dcd_trainer;

        public:
            optimizer_state() : did_init(false) {}

        private:

            template <
                typename in_sample_vector_type,
                typename in_scalar_vector_type
                >
            void init(
                const in_sample_vector_type& x,
                const in_scalar_vector_type& y,
                bool have_bias_,
                bool last_weight_1_,
                bool do_svm_l2_,
                scalar_type Cpos,
                scalar_type Cneg
            )
            {
                const long new_dims = max_index_plus_one(x);
                long new_idx = 0;

                if (did_init)
                {
                    DLIB_CASSERT(have_bias_ == have_bias &&
                                 last_weight_1_ == last_weight_1,
                        "\t decision_function svm_c_linear_dcd_trainer::train(x,y,state)"
                        << "\n\t The given state object is invalid because the previous trainer was configured differently."
                        << "\n\t have_bias_: " << have_bias_
                        << "\n\t have_bias: " << have_bias
                        << "\n\t last_weight_1_: " << last_weight_1_
                        << "\n\t last_weight_1: " << last_weight_1
                        );

                    DLIB_CASSERT( new_dims >= dims,
                        "\t decision_function svm_c_linear_dcd_trainer::train(x,y,state)"
                        << "\n\t The given state object is invalid because the training data dimensions have shrunk."
                        << "\n\t new_dims: " << new_dims
                        << "\n\t dims: " << dims
                        );

                    DLIB_CASSERT( x.size() >= static_cast<long>(alpha.size()),
                        "\t decision_function svm_c_linear_dcd_trainer::train(x,y,state)"
                        << "\n\t The given state object is invalid because the training data has fewer samples than previously."
                        << "\n\t x.size():     " << x.size()
                        << "\n\t alpha.size(): " << alpha.size()
                        );

                    // make sure we amortize the cost of growing the alpha vector.
                    if (alpha.capacity() < static_cast<unsigned long>(x.size()))
                        alpha.reserve(x.size()*2);

                    new_idx = alpha.size();

                    // Make sure alpha has the same length as x.  So pad with extra zeros if
                    // necessary to make this happen.
                    alpha.resize(x.size(),0);

                    if (new_dims != dims)
                    {
                        // The only valid way the dimensions can be different here is if
                        // you are using a sparse vector type.  This is because we might
                        // have had training samples which just happened to not include all
                        // the features previously.  Therefore, max_index_plus_one() would
                        // have given too low of a result.  But for dense vectors it is
                        // definitely a user error if the dimensions don't match.
                        DLIB_CASSERT(is_matrix<sample_type>::value == false,
                            "\t decision_function svm_c_linear_dcd_trainer::train(x,y,state)"
                            << "\n\t The given state object is invalid because the training data dimensions have changed."
                            << "\n\t new_dims: " << new_dims
                            << "\n\t dims: " << dims
                            );

                        // extend w by the right number of elements
                        if (have_bias && !last_weight_1)
                        {
                            // Splice some zeros into the w vector so it will have the
                            // right length.  Here we are being careful to move the bias
                            // weight to the end of the resulting vector.
                            w = join_cols(join_cols(
                                colm(w,0,dims),
                                zeros_matrix<scalar_type>(new_dims-dims,1)),
                                uniform_matrix<scalar_type>(1,1,w(dims)) );
                        }
                        else
                        {
                            // Just concatenate the right number of zeros.
                            w = join_cols(w, zeros_matrix<scalar_type>(new_dims-dims,1));
                        }
                        dims = new_dims;
                    }
                }
                else
                {
                    did_init = true;
                    have_bias = have_bias_;
                    last_weight_1 = last_weight_1_;
                    dims = new_dims;

                    alpha.resize(x.size());

                    index.reserve(x.size());
                    Q.reserve(x.size());

                    if (have_bias && !last_weight_1)
                        w.set_size(dims+1);
                    else
                        w.set_size(dims);

                    w = 0;
                }

                for (long i = new_idx; i < x.size(); ++i)
                {
                    Q.push_back(length_squared(x(i)));

                    if (have_bias && !last_weight_1)
                    {
                        index.push_back(i);
                        Q.back() += 1;
                    }
                    else if (Q.back() != 0)
                    {
                        index.push_back(i);
                    }

                    if (do_svm_l2_)
                    {
                        if (y(i) > 0)
                            Q.back() += 1/(2*Cpos);
                        else
                            Q.back() += 1/(2*Cneg);
                    }
                }

                if (last_weight_1)
                    w(dims-1) = 1;
            }
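            /*
                A note on what init() caches, as a sketch of the arithmetic derived
                from the loop above.  Q holds the diagonal of the dual problem's
                Gram matrix, i.e. for each sample

                    Q[i] = dot(x(i),x(i))   // the linear kernel value
                         + 1                // if a bias is simulated via an extra -1 feature
                         + 1/(2*C)          // if solving the L2-SVM variant, with the
                                            // per-class C of sample i

                (when force_last_weight_to_1() is set, the last dimension is
                excluded from the dot product).  Samples whose entry would be 0,
                i.e. all-zero vectors with no simulated bias, are left out of the
                index set: they contribute nothing to w and would otherwise cause a
                division by zero in the alpha update.
            */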
            template <typename T>
            typename enable_if<is_matrix<T>,scalar_type>::type length_squared (
                const T& x
            ) const
            {
                if (!last_weight_1)
                {
                    return dlib::dot(x,x);
                }
                else
                {
                    // skip the last dimension
                    return dlib::dot(colm(x,0,x.size()-1),
                                     colm(x,0,x.size()-1));
                }
            }

            template <typename T>
            typename disable_if<is_matrix<T>,scalar_type>::type length_squared (
                const T& x
            ) const
            {
                if (!last_weight_1)
                {
                    return dlib::dot(x,x);
                }
                else
                {
                    scalar_type temp = 0;
                    typename T::const_iterator i;
                    for (i = x.begin(); i != x.end(); ++i)
                    {
                        // skip the last dimension
                        if (static_cast<long>(i->first) < dims-1)
                            temp += i->second*i->second;
                    }
                    return temp;
                }
            }

            bool did_init;
            bool have_bias;
            bool last_weight_1;
            std::vector<scalar_type> alpha;
            scalar_vector_type w;
            std::vector<scalar_type> Q;
            std::vector<long> index;
            long dims;
            dlib::rand rnd;

        public:

            const std::vector<scalar_type>& get_alpha () const { return alpha; }

            friend void serialize(const optimizer_state& item, std::ostream& out)
            {
                const int version = 1;
                dlib::serialize(version, out);
                dlib::serialize(item.did_init, out);
                dlib::serialize(item.have_bias, out);
                dlib::serialize(item.last_weight_1, out);
                dlib::serialize(item.alpha, out);
                dlib::serialize(item.w, out);
                dlib::serialize(item.Q, out);
                dlib::serialize(item.index, out);
                dlib::serialize(item.dims, out);
                dlib::serialize(item.rnd, out);
            }

            friend void deserialize(optimizer_state& item, std::istream& in)
            {
                int version = 0;
                dlib::deserialize(version, in);
                if (version != 1)
                {
                    throw dlib::serialization_error(
                        "Error while deserializing dlib::svm_c_linear_dcd_trainer::optimizer_state, unexpected version."
                        );
                }

                dlib::deserialize(item.did_init, in);
                dlib::deserialize(item.have_bias, in);
                dlib::deserialize(item.last_weight_1, in);
                dlib::deserialize(item.alpha, in);
                dlib::deserialize(item.w, in);
                dlib::deserialize(item.Q, in);
                dlib::deserialize(item.index, in);
                dlib::deserialize(item.dims, in);
                dlib::deserialize(item.rnd, in);
            }

        };
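        /*
            The optimizer_state object above enables warm starts: the train()
            overload below that accepts a state resumes optimization from the
            previously computed alpha vector instead of starting from zero.  A
            sketch of incremental training (the variable names are illustrative):

                svm_c_linear_dcd_trainer<kernel_type> trainer;
                svm_c_linear_dcd_trainer<kernel_type>::optimizer_state state;

                decision_function<kernel_type> df = trainer.train(samples, labels, state);
                // ... append new samples and labels to the training set ...
                df = trainer.train(samples, labels, state);  // warm start

            Because optimizer_state is serializable, a training session can also be
            checkpointed to disk and resumed in a later process.
        */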
        template <
            typename in_sample_vector_type,
            typename in_scalar_vector_type
            >
        const decision_function<kernel_type> train (
            const in_sample_vector_type& x,
            const in_scalar_vector_type& y
        ) const
        {
            optimizer_state state;
            return do_train(mat(x), mat(y), state);
        }

        template <
            typename in_sample_vector_type,
            typename in_scalar_vector_type
            >
        const decision_function<kernel_type> train (
            const in_sample_vector_type& x,
            const in_scalar_vector_type& y,
            optimizer_state& state
        ) const
        {
            return do_train(mat(x), mat(y), state);
        }

    private:

    // ------------------------------------------------------------------------------------

        template <
            typename in_sample_vector_type,
            typename in_scalar_vector_type
            >
        const decision_function<kernel_type> do_train (
            const in_sample_vector_type& x,
            const in_scalar_vector_type& y,
            optimizer_state& state
        ) const
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(is_learning_problem(x,y) == true,
                "\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
                << "\n\t invalid inputs were given to this function"
                << "\n\t x.size(): " << x.size()
                << "\n\t y.size(): " << y.size()
                << "\n\t is_learning_problem(x,y): " << is_learning_problem(x,y)
                );
#ifdef ENABLE_ASSERTS
            for (long i = 0; i < x.size(); ++i)
            {
                DLIB_ASSERT(y(i) == +1 || y(i) == -1,
                    "\t decision_function svm_c_linear_dcd_trainer::train(x,y)"
                    << "\n\t invalid inputs were given to this function"
                    << "\n\t y("<<i<<"): " << y(i)
                    );
            }
#endif

            state.init(x,y,have_bias,last_weight_1,do_svm_l2,Cpos,Cneg);

            std::vector<scalar_type>& alpha = state.alpha;
            scalar_vector_type& w = state.w;
            std::vector<long>& index = state.index;
            const long dims = state.dims;

            unsigned long active_size = index.size();

            scalar_type PG_max_prev = std::numeric_limits<scalar_type>::infinity();
            scalar_type PG_min_prev = -std::numeric_limits<scalar_type>::infinity();

            const scalar_type Dii_pos = 1/(2*Cpos);
            const scalar_type Dii_neg = 1/(2*Cneg);
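            // What follows is the dual coordinate descent method of Hsieh et al.,
            // "A Dual Coordinate Descent Method for Large-scale Linear SVM" (ICML
            // 2008).  As a sketch, the C-SVM dual being solved is
            //
            //      min_a  0.5*a'*Q*a - sum(a)    subject to  0 <= a(i) <= U
            //
            // where U = C for the L1-SVM and U = infinity for the L2-SVM (the
            // 1/(2*C) regularization then lives on the diagonal of Q instead).
            // Each pass visits the shuffled active samples and applies the clipped
            // single-coordinate Newton step
            //
            //      G    = y(i)*dot(w,x(i)) - 1              // partial gradient
            //      a(i) = min(max(a(i) - G/Q(i,i), 0), U)
            //
            // while updating w incrementally so dot(w,x(i)) stays cheap.  PG is the
            // projected gradient; PG_max - PG_min drives both the eps stopping test
            // and the shrinking heuristic that temporarily removes samples stuck at
            // their box constraints.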
            // main loop
            for (unsigned long iter = 0; iter < max_iterations; ++iter)
            {
                scalar_type PG_max = -std::numeric_limits<scalar_type>::infinity();
                scalar_type PG_min = std::numeric_limits<scalar_type>::infinity();

                // randomly shuffle the indices
                for (unsigned long i = 0; i < active_size; ++i)
                {
                    // pick a random index >= i
                    const long j = i + state.rnd.get_random_32bit_number()%(active_size-i);
                    std::swap(index[i], index[j]);
                }

                // for all the active training samples
                for (unsigned long ii = 0; ii < active_size; ++ii)
                {
                    const long i = index[ii];

                    scalar_type G = y(i)*dot(w, x(i)) - 1;
                    if (do_svm_l2)
                    {
                        if (y(i) > 0)
                            G += Dii_pos*alpha[i];
                        else
                            G += Dii_neg*alpha[i];
                    }
                    const scalar_type C = (y(i) > 0) ? Cpos : Cneg;
                    const scalar_type U = do_svm_l2 ? std::numeric_limits<scalar_type>::infinity() : C;

                    scalar_type PG = 0;
                    if (alpha[i] == 0)
                    {
                        if (G > PG_max_prev)
                        {
                            // shrink the active set of training examples
                            --active_size;
                            std::swap(index[ii], index[active_size]);
                            --ii;
                            continue;
                        }

                        if (G < 0)
                            PG = G;
                    }
                    else if (alpha[i] == U)
                    {
                        if (G < PG_min_prev)
                        {
                            // shrink the active set of training examples
                            --active_size;
                            std::swap(index[ii], index[active_size]);
                            --ii;
                            continue;
                        }

                        if (G > 0)
                            PG = G;
                    }
                    else
                    {
                        PG = G;
                    }

                    if (PG > PG_max)
                        PG_max = PG;
                    if (PG < PG_min)
                        PG_min = PG;

                    // if PG != 0
                    if (std::abs(PG) > 1e-12)
                    {
                        const scalar_type alpha_old = alpha[i];
                        alpha[i] = std::min(std::max(alpha[i] - G/state.Q[i], (scalar_type)0.0), U);
                        const scalar_type delta = (alpha[i]-alpha_old)*y(i);
                        add_to(w, x(i), delta);
                        if (have_bias && !last_weight_1)
                            w(w.size()-1) -= delta;

                        if (last_weight_1)
                            w(dims-1) = 1;
                    }
                }

                if (verbose)
                {
                    using namespace std;
                    cout << "gap: " << PG_max - PG_min << endl;
                    cout << "active_size: " << active_size << endl;
                    cout << "iter: " << iter << endl;
                    cout << endl;
                }

                if (PG_max - PG_min <= eps)
                {
                    // stop if we are within eps tolerance and the last iteration
                    // was over all the samples
                    if (active_size == index.size())
                        break;

                    // Turn off shrinking on the next iteration.  We will stop if the
                    // tolerance is still <= eps when shrinking is off.
                    active_size = index.size();
                    PG_max_prev = std::numeric_limits<scalar_type>::infinity();
                    PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
                }
                else if (do_shrinking)
                {
                    PG_max_prev = PG_max;
                    PG_min_prev = PG_min;
                    if (PG_max_prev <= 0)
                        PG_max_prev = std::numeric_limits<scalar_type>::infinity();
                    if (PG_min_prev >= 0)
                        PG_min_prev = -std::numeric_limits<scalar_type>::infinity();
                }

            } // end of main optimization loop

            // put the solution into a decision function and then return it
            decision_function<kernel_type> df;
            if (have_bias && !last_weight_1)
                df.b = w(w.size()-1);
            else
                df.b = 0;

            df.basis_vectors.set_size(1);
            // Copy the plane normal into the output basis vector.  The output vector
            // might be a sparse vector container so we need to use this special kind
            // of copy to handle that case.
            assign(df.basis_vectors(0), colm(w, 0, dims));
            df.alpha.set_size(1);
            df.alpha(0) = 1;

            return df;
        }

        scalar_type dot (
            const scalar_vector_type& w,
            const sample_type& sample
        ) const
        {
            if (have_bias && !last_weight_1)
            {
                const long w_size_m1 = w.size()-1;
                return dlib::dot(colm(w,0,w_size_m1), sample) - w(w_size_m1);
            }
            else
            {
                return dlib::dot(w, sample);
            }
        }

    // ------------------------------------------------------------------------------------

        scalar_type Cpos;
        scalar_type Cneg;
        scalar_type eps;
        unsigned long max_iterations;
        bool verbose;

        // having a bias means we pretend all x vectors have an extra element which
        // is always -1.
        bool have_bias;

        bool last_weight_1;
        bool do_shrinking;
        bool do_svm_l2;

    }; // end of class svm_c_linear_dcd_trainer

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_SVm_C_LINEAR_DCD_TRAINER_Hh_