// Copyright (C) 2010  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATIoN_OCA_Hh_
#define DLIB_OPTIMIZATIoN_OCA_Hh_

#include "optimization_oca_abstract.h"

#include "../matrix.h"
#include "optimization_solve_qp_using_smo.h"
#include <vector>
#include "../sequence.h"

// ----------------------------------------------------------------------------------------

namespace dlib
{
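    // The oca_problem interface below describes the convex risk minimization problems this
    // file's solver operates on.  A problem object supplies the tradeoff constant C (via
    // get_c()), the dimensionality of w (via get_num_dimensions()), the convex risk R(w)
    // together with a subgradient of it at the current w (via get_risk()), and a stopping
    // test (via optimization_status()).  The oca object defined further below then minimizes
    //    0.5*||w||^2 + C*R(w)
    // or one of the elastic net / prior variants selected by the overload used to invoke it.
    // See optimization_oca_abstract.h for the complete documentation of this interface.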
    template <typename matrix_type>
    class oca_problem
    {
    public:
        typedef typename matrix_type::type scalar_type;

        virtual ~oca_problem() {}

        virtual bool risk_has_lower_bound (
            scalar_type& 
        ) const { return false; }

        virtual bool optimization_status (
            scalar_type ,
            scalar_type ,
            scalar_type ,
            scalar_type ,
            unsigned long,
            unsigned long
        ) const = 0;

        virtual scalar_type get_c (
        ) const = 0;

        virtual long get_num_dimensions (
        ) const = 0;

        virtual void get_risk (
            matrix_type& current_solution,
            scalar_type& risk_value,
            matrix_type& risk_subgradient
        ) const = 0;

    };
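
    // A minimal usage sketch (illustrative only, not part of the library): a user derives
    // from oca_problem, reporting C, the dimensionality, the risk and a subgradient of it,
    // and a stopping test, and then hands the object to the oca solver defined below.
    //
    //    class my_problem : public oca_problem<matrix<double,0,1> >
    //    {
    //    public:
    //        bool optimization_status (double obj, double gap, double risk, double risk_gap,
    //                                  unsigned long num_cutting_planes, unsigned long iter) const
    //        { return gap < 0.001*obj || iter > 1000; }
    //
    //        double get_c () const { return 10; }
    //
    //        long get_num_dimensions () const { return 100; }
    //
    //        void get_risk (matrix<double,0,1>& w, double& risk, matrix<double,0,1>& subgrad) const
    //        { /* compute the convex risk at w and a subgradient of it */ }
    //    };
    //
    //    oca solver;
    //    matrix<double,0,1> w;
    //    double obj = solver(my_problem(), w);  // w now contains the minimizer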

// ----------------------------------------------------------------------------------------
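    // The oca object defined below minimizes the above objective with a cutting plane
    // (bundle) method: on each iteration it asks the oca_problem for the risk and a
    // subgradient at the current w, adds the resulting affine lower bound on the risk to a
    // cutting plane model, and solves a small QP over that model to get the next w.  It
    // keeps iterating until the problem's optimization_status() callback says to stop.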

    class oca
    {
    public:

        oca () 
        {
            sub_eps = 1e-2;
            sub_max_iter = 50000;

            inactive_thresh = 20;
        }

        void set_subproblem_epsilon (
            double eps_
        ) { sub_eps = eps_; }

        double get_subproblem_epsilon (
        ) const { return sub_eps; }

        void set_subproblem_max_iterations (
            unsigned long sub_max_iter_
        ) 
        { 
            // make sure requires clause is not broken
            DLIB_ASSERT(sub_max_iter_ > 0,
                "\t void oca::set_subproblem_max_iterations"
                << "\n\t max iterations must be greater than 0"
                << "\n\t sub_max_iter_: " << sub_max_iter_
                << "\n\t this: " << this
                );

            sub_max_iter = sub_max_iter_; 
        }

        unsigned long get_subproblem_max_iterations (
        ) const { return sub_max_iter; }

        void set_inactive_plane_threshold (
            unsigned long inactive_thresh_
        ) 
        { 
            // make sure requires clause is not broken
            DLIB_ASSERT(inactive_thresh_ > 0,
                "\t void oca::set_inactive_plane_threshold"
                << "\n\t inactive threshold must be greater than 0"
                << "\n\t inactive_thresh_: " << inactive_thresh_
                << "\n\t this: " << this
                );

            inactive_thresh = inactive_thresh_; 
        }

        unsigned long get_inactive_plane_threshold (
        ) const { return inactive_thresh; }

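        // Run the solver on the given problem.  On return, w contains the solution and the
        // returned value is the objective value at that solution.  The first num_nonnegative
        // elements of w are constrained to be >= 0, and if force_weight_to_1 is less than
        // w.size() then w(force_weight_to_1) is fixed at 1 while it and all later elements
        // are excluded from the optimization (the later elements come back as 0).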
        template <
            typename matrix_type
            >
        typename matrix_type::type operator() (
            const oca_problem<matrix_type>& problem,
            matrix_type& w,
            unsigned long num_nonnegative = 0,
            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
        ) const
        {
            matrix_type empty_prior;
            return oca_impl(problem, w, empty_prior, false, num_nonnegative, force_weight_to_1, 0);
        }

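        // Like operator() above, but solves the elastic net regularized version of the
        // problem, i.e. it uses 0.5*(1-lasso_lambda)*||w||^2 + lasso_lambda*||w||_1 as the
        // regularizer instead of 0.5*||w||^2.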
        template <
            typename matrix_type
            >
        typename matrix_type::type solve_with_elastic_net (
            const oca_problem<matrix_type>& problem,
            matrix_type& w,
            double lasso_lambda,
            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max()
        ) const
        {
            matrix_type empty_prior;
            return oca_impl(problem, w, empty_prior, false, 0, force_weight_to_1, lasso_lambda);
        }

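        // Like operator() above, but regularizes w toward the given prior vector, i.e. it
        // uses 0.5*||w-prior||^2 in place of 0.5*||w||^2 as the regularizer.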
        template <
            typename matrix_type
            >
        typename matrix_type::type operator() (
            const oca_problem<matrix_type>& problem,
            matrix_type& w,
            const matrix_type& prior
        ) const
        {
            // make sure requires clause is not broken
            DLIB_ASSERT(is_col_vector(prior) && prior.size() == problem.get_num_dimensions(),
                "\t scalar_type oca::operator()"
                << "\n\t The prior vector does not have the correct dimensions."
                << "\n\t is_col_vector(prior):         " << is_col_vector(prior) 
                << "\n\t prior.size():                 " << prior.size() 
                << "\n\t problem.get_num_dimensions(): " << problem.get_num_dimensions() 
                << "\n\t this:                         " << this
                );
            // disable the force weight to 1 option for this mode.  We also disable the
            // non-negative constraints.
            unsigned long force_weight_to_1 = std::numeric_limits<unsigned long>::max();
            return oca_impl(problem, w, prior, true, 0, force_weight_to_1, 0);
        }

    private:

        template <
            typename matrix_type
            >
        typename matrix_type::type oca_impl (
            const oca_problem<matrix_type>& problem,
            matrix_type& w,
            const matrix_type& prior,
            bool have_prior,
            unsigned long num_nonnegative,
            unsigned long force_weight_to_1,
            const double lasso_lambda
        ) const
        {
            const unsigned long num_dims = problem.get_num_dimensions();

            // make sure requires clause is not broken
            DLIB_ASSERT(problem.get_c() > 0 &&
                        problem.get_num_dimensions() > 0 && 
                        0 <= lasso_lambda && lasso_lambda < 1,
                "\t scalar_type oca::operator()"
                << "\n\t The oca_problem is invalid"
                << "\n\t problem.get_c():              " << problem.get_c() 
                << "\n\t problem.get_num_dimensions(): " << num_dims 
                << "\n\t lasso_lambda:                 " << lasso_lambda 
                << "\n\t this: " << this
                );
            if (have_prior)
            {
                DLIB_ASSERT(lasso_lambda == 0, "Solver doesn't support using a prior with lasso.");
                DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using a prior with non-negative constraints.");
            }
            else if (lasso_lambda != 0)
            {
                DLIB_ASSERT(num_nonnegative == 0, "Solver doesn't support using lasso with non-negative constraints.");
            }

            const double ridge_lambda = 1-lasso_lambda;

            if (num_nonnegative > num_dims)
                num_nonnegative = num_dims;

            typedef typename matrix_type::type scalar_type;
            typedef typename matrix_type::layout_type layout_type;
            typedef typename matrix_type::mem_manager_type mem_manager_type;
            typedef matrix_type vect_type;

            const scalar_type C = problem.get_c();

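            // planes holds the cutting plane subgradients and bs their offsets, so plane i
            // is the affine lower bound dot(planes[i],w) + bs[i] <= risk(w).  miss_count[i]
            // tracks how many consecutive iterations plane i has been inactive (alpha(i)==0)
            // so stale planes can be discarded.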
            typename sequence<vect_type>::kernel_2a planes;
            std::vector<scalar_type> bs, miss_count;

            vect_type new_plane, alpha, btemp;

            w.set_size(num_dims, 1);
            w = 0;

            // The current objective value.  Note also that w always contains 
            // the current solution.
            scalar_type cur_obj = std::numeric_limits<scalar_type>::max();

            // This will hold the cutting plane objective value.  This value is
            // a lower bound on the true optimal objective value.
            scalar_type cp_obj = 0;

            matrix<scalar_type,0,0,mem_manager_type, layout_type> K, Ktmp;
            matrix<scalar_type,0,1,mem_manager_type, layout_type> lambda, d;
            if (lasso_lambda != 0)
                d.set_size(num_dims);
            else
                d.set_size(num_nonnegative);
            d = lasso_lambda*ones_matrix(d);

            scalar_type R_lower_bound;
            if (problem.risk_has_lower_bound(R_lower_bound))
            {
                // The flat lower bounding plane is always good to have if we know
                // what it is.
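                // A constant lower bound on the risk is just a cutting plane with a zero
                // subgradient and an offset of R_lower_bound, so seed the model with it.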
                bs.push_back(R_lower_bound);
                new_plane = zeros_matrix(w);
                planes.add(0, new_plane);
                alpha = uniform_matrix<scalar_type>(1,1, C);
                miss_count.push_back(0);

                K.set_size(1,1);
                K(0,0) = 0;
            }

            const double prior_norm = have_prior ?  0.5*dot(prior,prior) : 0;

            unsigned long counter = 0;
            while (true)
            {

                // add the next cutting plane
                scalar_type cur_risk;
                if (force_weight_to_1 < (unsigned long)w.size())
                    w(force_weight_to_1) = 1;

                problem.get_risk(w, cur_risk, new_plane);

                if (force_weight_to_1 < (unsigned long)w.size())
                {
                    // We basically arrange for the w(force_weight_to_1) element and all
                    // subsequent elements of w to not be involved in the optimization at
                    // all.  An easy way to do this is to just make sure those elements of w,
                    // and the corresponding elements in the subgradient, are always set to
                    // zero while we run the cutting plane algorithm.  The only time
                    // w(force_weight_to_1) is 1 is when we pass it to the oca_problem.
                    set_rowm(w, range(force_weight_to_1, w.size()-1)) = 0;
                    set_rowm(new_plane, range(force_weight_to_1, new_plane.size()-1)) = 0;
                }

                if (have_prior)
                    bs.push_back(cur_risk - dot(w,new_plane) + dot(prior,new_plane));
                else
                    bs.push_back(cur_risk - dot(w,new_plane));
                planes.add(planes.size(), new_plane);
                miss_count.push_back(0);

                // If alpha is empty then initialize it (we must always have sum(alpha) == C).
                // Otherwise, just append a zero.
                if (alpha.size() == 0)
                    alpha = uniform_matrix<scalar_type>(1,1, C);
                else
                    alpha = join_cols(alpha,zeros_matrix<scalar_type>(1,1));

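                // cur_obj below is the true (primal) objective at the current w:
                //   0.5*ridge_lambda*||w||^2 + lasso_lambda*||w||_1 + C*risk(w),
                // plus, in the prior mode, the 0.5*||prior||^2 - dot(w,prior) terms that
                // complete 0.5*||w-prior||^2 (lasso and the prior are never used together).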
                const scalar_type wnorm = 0.5*ridge_lambda*trans(w)*w + lasso_lambda*sum(abs(w));
                const double prior_part = have_prior? dot(w,prior) : 0;
                cur_obj = wnorm + C*cur_risk + prior_norm-prior_part;

                // report current status
                const scalar_type risk_gap = cur_risk - (cp_obj-wnorm+prior_part-prior_norm)/C;
                if (counter > 0 && problem.optimization_status(cur_obj, cur_obj - cp_obj, 
                                                               cur_risk, risk_gap, planes.size(), counter))
                {
                    break;
                }

                // compute the kernel matrix for all the planes.  K is the Gram matrix of the
                // cutting plane subgradients, i.e. K(r,c) == dot(planes[r], planes[c]).
                K.swap(Ktmp);
                K.set_size(planes.size(), planes.size());
                // copy over the old K matrix
                set_subm(K, 0,0, Ktmp.nr(), Ktmp.nc()) = Ktmp;

                // now add the new row and column to K
                for (unsigned long c = 0; c < planes.size(); ++c)
                {
                    K(c, Ktmp.nc()) = dot(planes[c], planes[planes.size()-1]);
                    K(Ktmp.nc(), c) = K(c,Ktmp.nc());
                }


                // Solve the cutting plane subproblem for the next w.  We solve it to an
                // accuracy that scales with the size of the current error gap.  In
                // particular, we ask the QP solver to run until its duality gap is down to
                // a tenth of the current primal-dual gap.  The factor of ridge_lambda is
                // needed because the objective function of the QP we hand to the solver is
                // implicitly scaled by ridge_lambda.
                scalar_type eps = std::min<scalar_type>(sub_eps, 0.1*ridge_lambda*(cur_obj-cp_obj));
                // just a sanity check
                if (eps < 1e-16)
                    eps = 1e-16;
                // Note that we warm start this optimization by using the alpha from the last
                // iteration as the starting point.
                if (lasso_lambda != 0)
                {
                    // copy planes into a matrix so we can call solve_qp4_using_smo()
                    matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_dims,planes.size());
                    for (unsigned long i = 0; i < planes.size(); ++i)
                        set_colm(planes_mat,i) = planes[i];

                    btemp = ridge_lambda*mat(bs) - trans(planes_mat)*d;
                    solve_qp4_using_smo(planes_mat, K, btemp, d, alpha, lambda, eps, sub_max_iter, (scalar_type)(2*lasso_lambda)); 
                }
                else if (num_nonnegative != 0)
                {
                    // copy planes into a matrix so we can call solve_qp4_using_smo()
                    matrix<scalar_type,0,0,mem_manager_type, layout_type> planes_mat(num_nonnegative,planes.size());
                    for (unsigned long i = 0; i < planes.size(); ++i)
                        set_colm(planes_mat,i) = colm(planes[i],0,num_nonnegative);

                    solve_qp4_using_smo(planes_mat, K, mat(bs), d, alpha, lambda, eps, sub_max_iter); 
                }
                else
                {
                    solve_qp_using_smo(K, mat(bs), alpha, eps, sub_max_iter); 
                }
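
                // alpha now holds the solution of the cutting plane dual QP and its elements
                // sum to C.  The primal w is recovered from it below as
                // w = -sum_i alpha(i)*planes[i], with the lasso case additionally shifted by
                // lambda-d and rescaled by 1/ridge_lambda.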

                // construct the w that minimizes the subproblem.
                w = -alpha(0)*planes[0];
                for (unsigned long i = 1; i < planes.size(); ++i)
                    w -= alpha(i)*planes[i];
                if (lasso_lambda != 0)
                    w = (lambda-d+w)/ridge_lambda;
                else if (num_nonnegative != 0) // threshold the first num_nonnegative w elements if necessary.
                    set_rowm(w,range(0,num_nonnegative-1)) = lowerbound(rowm(w,range(0,num_nonnegative-1)),0);

                for (long i = 0; i < alpha.size(); ++i)
                {
                    if (alpha(i) != 0)
                        miss_count[i] = 0;
                    else
                        miss_count[i] += 1;
                }

                // Compute the lower bound on the true objective given to us by the cutting 
                // plane subproblem.
                cp_obj = -0.5*ridge_lambda*trans(w)*w + trans(alpha)*mat(bs);
                if (have_prior)
                    w += prior;

                // If it has been a while since a cutting plane was an active constraint then
                // we should throw it away.
                while (max(mat(miss_count)) >= inactive_thresh)
                {
                    const long idx = index_of_max(mat(miss_count));
                    bs.erase(bs.begin()+idx);
                    miss_count.erase(miss_count.begin()+idx);
                    K = removerc(K, idx, idx);
                    alpha = remove_row(alpha,idx);
                    planes.remove(idx, new_plane);
                }

                ++counter;
            }

            if (force_weight_to_1 < (unsigned long)w.size())
                w(force_weight_to_1) = 1;

            return cur_obj;
        }

        double sub_eps;

        unsigned long sub_max_iter;

        unsigned long inactive_thresh;
    };
}

// ----------------------------------------------------------------------------------------

#endif // DLIB_OPTIMIZATIoN_OCA_Hh_