// Copyright (C) 2008  Davis E. King (davis@dlib.net)
// License: Boost Software License   See LICENSE.txt for the full license.
#ifndef DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
#define DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_

#include <cmath>
#include <limits>
#include <vector>
#include "../matrix.h"
#include "../algs.h"
#include "optimization_search_strategies_abstract.h"
#include "../sequence.h"

namespace dlib
{

// ----------------------------------------------------------------------------------------

    class cg_search_strategy
    {
    public:
        cg_search_strategy() : been_used(false) {}

        double get_wolfe_rho (
        ) const { return 0.001; }

        double get_wolfe_sigma (
        ) const { return 0.01; }

        unsigned long get_max_line_search_iterations (
        ) const { return 100; }

        template <typename T>
        const matrix<double,0,1>& get_next_direction (
            const T& ,
            const double ,
            const T& funct_derivative
        )
        {
            if (been_used == false)
            {
                been_used = true;
                prev_direction = -funct_derivative;
            }
            else
            {
                // Use the Polak-Ribiere (4.1.12) conjugate gradient update described by Fletcher on page 83.
                const double temp = trans(prev_derivative)*prev_derivative;
                // If this value hits zero then just use the direction of steepest descent.
                if (std::abs(temp) < std::numeric_limits<double>::epsilon())
                {
                    prev_derivative = funct_derivative;
                    prev_direction = -funct_derivative;
                    return prev_direction;
                }

                double b = trans(funct_derivative-prev_derivative)*funct_derivative/(temp);
                prev_direction = -funct_derivative + b*prev_direction;
            }

            prev_derivative = funct_derivative;
            return prev_direction;
        }

    private:
        bool been_used;
        matrix<double,0,1> prev_derivative;
        matrix<double,0,1> prev_direction;
    };

// ----------------------------------------------------------------------------------------

    class bfgs_search_strategy
    {
    public:
        bfgs_search_strategy() : been_used(false), been_used_twice(false) {}

        double get_wolfe_rho (
        ) const { return 0.01; }

        double get_wolfe_sigma (
        ) const { return 0.9; }

        unsigned long get_max_line_search_iterations (
        ) const { return 100; }

        template <typename T>
        const matrix<double,0,1>& get_next_direction (
            const T& x,
            const double ,
            const T& funct_derivative
        )
        {
            if (been_used == false)
            {
                been_used = true;
                H = identity_matrix<double>(x.size());
            }
            else
            {
                // Update H with the BFGS formula from (3.2.12) on page 55 of Fletcher.
                delta = (x-prev_x);
                gamma = funct_derivative-prev_derivative;

                double dg = dot(delta,gamma);

                // Try to set the initial value of the H matrix to something reasonable if we are still
                // in the early stages of figuring out what it is.  The formula below is the one suggested
                // in the book Numerical Optimization by Nocedal and Wright in the chapter on Quasi-Newton methods.
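                // Concretely, H is rescaled to ((trans(delta)*gamma)/(trans(gamma)*gamma))*I, with the
                // scale factor clamped to the range [0.01, 100] before it is applied.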
                if (been_used_twice == false)
                {
                    double gg = trans(gamma)*gamma;
                    if (std::abs(gg) > std::numeric_limits<double>::epsilon())
                    {
                        const double temp = put_in_range(0.01, 100, dg/gg);
                        H = diagm(uniform_matrix<double>(x.size(),1, temp));
                        been_used_twice = true;
                    }
                }

                Hg = H*gamma;
                gH = trans(trans(gamma)*H);
                double gHg = trans(gamma)*H*gamma;
                // Only apply the rank-two update when the involved quantities are finite and dg is
                // nonzero.  Otherwise reset H and start accumulating curvature information again.
                if (gHg < std::numeric_limits<double>::infinity() && dg < std::numeric_limits<double>::infinity() &&
                    dg != 0)
                {
                    H += (1 + gHg/dg)*delta*trans(delta)/(dg) - (delta*trans(gH) + Hg*trans(delta))/(dg);
                }
                else
                {
                    H = identity_matrix<double>(H.nr());
                    been_used_twice = false;
                }
            }

            prev_x = x;
            prev_direction = -H*funct_derivative;
            prev_derivative = funct_derivative;
            return prev_direction;
        }

    private:
        bool been_used;
        bool been_used_twice;
        matrix<double,0,1> prev_x;
        matrix<double,0,1> prev_derivative;
        matrix<double,0,1> prev_direction;
        matrix<double> H;
        matrix<double,0,1> delta, gamma, Hg, gH;
    };

// ----------------------------------------------------------------------------------------

    class lbfgs_search_strategy
    {
    public:
        explicit lbfgs_search_strategy(unsigned long max_size_) : max_size(max_size_), been_used(false)
        {
            DLIB_ASSERT (
                max_size > 0,
                "\t lbfgs_search_strategy(max_size)"
                << "\n\t max_size can't be zero"
            );
        }

        lbfgs_search_strategy(const lbfgs_search_strategy& item)
        {
            max_size = item.max_size;
            been_used = item.been_used;
            prev_x = item.prev_x;
            prev_derivative = item.prev_derivative;
            prev_direction = item.prev_direction;
            alpha = item.alpha;
            dh_temp = item.dh_temp;
            // Note that the stored (s,y) history in data is not copied, since dlib::sequence
            // objects are non-copyable.  A copied strategy therefore starts with an empty history.
        }

        double get_wolfe_rho (
        ) const { return 0.01; }

        double get_wolfe_sigma (
        ) const { return 0.9; }

        unsigned long get_max_line_search_iterations (
        ) const { return 100; }

        template <typename T>
        const matrix<double,0,1>& get_next_direction (
            const T& x,
            const double ,
            const T& funct_derivative
        )
        {
            prev_direction = -funct_derivative;

            if (been_used == false)
            {
                been_used = true;
            }
            else
            {
                // Add an element into the stored data sequence.
                dh_temp.s = x - prev_x;
                dh_temp.y = funct_derivative - prev_derivative;
                double temp = dot(dh_temp.s, dh_temp.y);
                // Only accept this bit of data if temp isn't zero.
                if (std::abs(temp) > std::numeric_limits<double>::epsilon())
                {
                    dh_temp.rho = 1/temp;
                    data.add(data.size(), dh_temp);
                }
                else
                {
                    data.clear();
                }

                if (data.size() > 0)
                {
                    // This block of code is the two-loop recursion from algorithm 7.4 in the Nocedal book.

                    alpha.resize(data.size());
                    // The first loop runs backward from the newest (s,y) pair to the oldest.  Since i
                    // is unsigned it wraps around after reaching 0, which makes i >= data.size() and
                    // terminates the loop.
                    for (unsigned long i = data.size()-1; i < data.size(); --i)
                    {
                        alpha[i] = data[i].rho*dot(data[i].s, prev_direction);
                        prev_direction -= alpha[i]*data[i].y;
                    }

                    // Take a guess at what the first H matrix should be.  The formula below is the one
                    // suggested in the book Numerical Optimization by Nocedal and Wright in the chapter
                    // on Large Scale Unconstrained Optimization (in the L-BFGS section).
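                    // Since rho == 1/(trans(s)*y), the expression below computes
                    // (trans(s)*y)/(trans(y)*y) for the most recent (s,y) pair, clamped to [0.001, 1000].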
                    double H_0 = 1.0/data[data.size()-1].rho/dot(data[data.size()-1].y, data[data.size()-1].y);
                    H_0 = put_in_range(0.001, 1000.0, H_0);
                    prev_direction *= H_0;

                    // The second loop runs forward from the oldest pair to the newest.
                    for (unsigned long i = 0; i < data.size(); ++i)
                    {
                        double beta = data[i].rho*dot(data[i].y, prev_direction);
                        prev_direction += data[i].s * (alpha[i] - beta);
                    }
                }
            }

            if (data.size() > max_size)
            {
                // Remove the oldest element in the data sequence.
                data.remove(0, dh_temp);
            }

            prev_x = x;
            prev_derivative = funct_derivative;
            return prev_direction;
        }

    private:
        struct data_helper
        {
            matrix<double,0,1> s;
            matrix<double,0,1> y;
            double rho;

            friend void swap(data_helper& a, data_helper& b)
            {
                a.s.swap(b.s);
                a.y.swap(b.y);
                std::swap(a.rho, b.rho);
            }
        };

        sequence<data_helper>::kernel_2a data;

        unsigned long max_size;
        bool been_used;
        matrix<double,0,1> prev_x;
        matrix<double,0,1> prev_derivative;
        matrix<double,0,1> prev_direction;
        std::vector<double> alpha;

        data_helper dh_temp;
    };

// ----------------------------------------------------------------------------------------

    template <typename hessian_funct>
    class newton_search_strategy_obj
    {
    public:
        explicit newton_search_strategy_obj(
            const hessian_funct& hess
        ) : hessian(hess) {}

        double get_wolfe_rho (
        ) const { return 0.01; }

        double get_wolfe_sigma (
        ) const { return 0.9; }

        unsigned long get_max_line_search_iterations (
        ) const { return 100; }

        template <typename T>
        const matrix<double,0,1> get_next_direction (
            const T& x,
            const double ,
            const T& funct_derivative
        )
        {
            // The Newton search direction is -inv(Hessian(x))*gradient(x).
            return -inv(hessian(x))*funct_derivative;
        }

    private:
        hessian_funct hessian;
    };

    template <typename hessian_funct>
    newton_search_strategy_obj<hessian_funct> newton_search_strategy (
        hessian_funct hessian
    ) { return newton_search_strategy_obj<hessian_funct>(hessian); }

// ----------------------------------------------------------------------------------------

}

#endif // DLIB_OPTIMIZATIOn_SEARCH_STRATEGIES_H_
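
// ----------------------------------------------------------------------------------------
// Usage sketch (illustrative only, not part of this header):  each strategy above is meant
// to be handed to dlib's find_min(), declared in dlib/optimization.h, together with a stop
// strategy, an objective function, and its derivative.  The objective and derivative names
// below are placeholders the user supplies.  Something like:
//
//     #include <dlib/optimization.h>
//     using namespace dlib;
//
//     typedef matrix<double,0,1> column_vector;
//     double my_objective (const column_vector& m);            // user supplied
//     column_vector my_derivative (const column_vector& m);    // user supplied
//
//     column_vector starting_point = {4, 8};
//     find_min(bfgs_search_strategy(),                  // or cg_search_strategy(),
//              objective_delta_stop_strategy(1e-7),     //    lbfgs_search_strategy(10), ...
//              my_objective, my_derivative, starting_point, -1);
//
// The final argument is a lower bound on the objective value; the search stops early if the
// objective ever drops to or below it, so -1 here assumes the true minimum is nonnegative.
// ----------------------------------------------------------------------------------------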