/*
 *      Online training with L2-regularized Stochastic Gradient Descent (SGD).
 *
 * Copyright (c) 2007-2010, Naoaki Okazaki
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in the
 *       documentation and/or other materials provided with the distribution.
 *     * Neither the names of the authors nor the names of its contributors
 *       may be used to endorse or promote products derived from this
 *       software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 * OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* $Id$ */

/*
    SGD for L2-regularized MAP estimation.

    The iterative algorithm is inspired by Pegasos:

    Shai Shalev-Shwartz, Yoram Singer, and Nathan Srebro.
    Pegasos: Primal Estimated sub-GrAdient SOlver for SVM.
    In Proc. of ICML 2007, pp. 807-814, 2007.

    The calibration strategy is inspired by the implementation of sgd:
    http://leon.bottou.org/projects/sgd
    written by Léon Bottou.

    The objective function to minimize is:

        f(w) = (lambda/2) * ||w||^2 - (1/N) * \sum_i^N log P^i(y|x)
        lambda = 2 * C / N

    The original version of the Pegasos algorithm proceeds as follows:

    0) Initialization
        t = t0
        k = [the batch size]
    1) Compute the learning rate (eta).
        eta = 1 / (lambda * t)
    2) Update the feature weights.
        w = (1 - eta * lambda) w + (eta / k) \sum_i (oexp - mexp)
    3) Project the feature weights onto an L2-ball.
        w = min{1, (1/sqrt(lambda))/||w||} * w
    4) Goto 1) until convergence.

    This implementation omits step 3) because it would complicate the source
    code (the L2 norm of the feature weights would have to be maintained at
    all times) and because the projection step does not have a strong impact
    on the quality of the solution.

    A naive implementation requires O(K) computations for step 2), where K
    is the total number of features. This code implements the procedure in
    a more efficient way:

    0) Initialization
        decay = 1
    1) Compute various factors.
        eta = 1 / (lambda * t)
        decay *= (1 - eta * lambda)
        gain = (eta / k) / decay
    2) Update the feature weights.
        Update the feature weights from the observation expectation:
            delta = gain * (1.0) * f(x,y)
            w += delta
        Update the feature weights from the model expectation:
            delta = gain * (-P(y|x)) * f(x,y)
            w += delta
    3) Goto 1) until convergence.
*/

#ifdef    HAVE_CONFIG_H
#include <config.h>
#endif/*HAVE_CONFIG_H*/

#include <os.h>

#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <crfsuite.h>
#include "crfsuite_internal.h"

#include "logging.h"
#include "params.h"
#include "crf1d.h"
#include "vecmath.h"

#define MIN(a, b)    ((a) < (b) ? (a) : (b))
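/*
    A minimal sketch, not called anywhere in this trainer, of the lazy
    L2-decay trick described in the comment above. A naive SGD step scales
    all K weights by (1 - eta * lambda) before applying the sparse gradient
    of a single instance, costing O(K) per step. Instead, the accumulated
    scale is kept in `decay` and each sparse update is divided by it, so a
    step only touches the features that fire on the current instance. The
    function name and the `active`/`grad` arguments are hypothetical and
    exist only for this sketch; the real update is performed inside
    gm->objective_and_gradients() in l2sgd() below.
*/
static void l2sgd_lazy_update_sketch(
    floatval_t *w,          /* stored weights; true weights are decay * w */
    floatval_t *decay,      /* accumulated scaling factor, starts at 1.   */
    const int *active,      /* indices of features active on the instance */
    const floatval_t *grad, /* (oexp - mexp) for each active feature      */
    int num_active,
    floatval_t eta,
    floatval_t lambda
    )
{
    int i;
    floatval_t gain;

    /* Fold the L2 shrinkage (1 - eta * lambda) into the running scale. */
    *decay *= (1.0 - eta * lambda);

    /* Divide by the scale so that (decay * w) receives exactly eta * grad. */
    gain = eta / *decay;

    /* Sparse update: only the active features are touched. */
    for (i = 0;i < num_active;++i) {
        w[active[i]] += gain * grad[i];
    }

    /* Periodically the caller should materialize the true weights with
       vecscale(w, *decay, K) and reset *decay to 1., exactly as the epoch
       loop in l2sgd() does, to avoid numerical underflow of decay. */
}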
typedef struct {
    floatval_t  c2;
    floatval_t  lambda;
    floatval_t  t0;
    int         max_iterations;
    int         period;
    floatval_t  delta;
    floatval_t  calibration_eta;
    floatval_t  calibration_rate;
    int         calibration_samples;
    int         calibration_candidates;
    int         calibration_max_trials;
} training_option_t;

static int l2sgd(
    encoder_t *gm,
    dataset_t *trainset,
    dataset_t *testset,
    floatval_t *w,
    logging_t *lg,
    const int N,
    const floatval_t t0,
    const floatval_t lambda,
    const int num_epochs,
    int calibration,
    int period,
    const floatval_t epsilon,
    floatval_t *ptr_loss
    )
{
    int i, epoch, ret = 0;
    floatval_t t = 0;
    floatval_t loss = 0, sum_loss = 0;
    floatval_t best_sum_loss = DBL_MAX;
    floatval_t eta, gain, decay = 1.;
    floatval_t improvement = 0.;
    floatval_t norm2 = 0.;
    floatval_t *pf = NULL;
    floatval_t *best_w = NULL;
    clock_t clk_prev, clk_begin = clock();
    const int K = gm->num_features;

    if (!calibration) {
        pf = (floatval_t*)malloc(sizeof(floatval_t) * period);
        best_w = (floatval_t*)calloc(K, sizeof(floatval_t));
        if (pf == NULL || best_w == NULL) {
            ret = CRFSUITEERR_OUTOFMEMORY;
            goto error_exit;
        }
    }

    /* Initialize the feature weights. */
    vecset(w, 0, K);

    /* Loop for epochs. */
    for (epoch = 1;epoch <= num_epochs;++epoch) {
        clk_prev = clock();

        if (!calibration) {
            logging(lg, "***** Epoch #%d *****\n", epoch);
            /* Shuffle the training instances. */
            dataset_shuffle(trainset);
        }

        /* Loop for instances. */
        sum_loss = 0.;
        for (i = 0;i < N;++i) {
            const crfsuite_instance_t *inst = dataset_get(trainset, i);

            /* Update various factors. */
            eta = 1 / (lambda * (t0 + t));
            decay *= (1.0 - eta * lambda);
            gain = eta / decay;

            /* Compute the loss and gradients for the instance. */
            gm->set_weights(gm, w, decay);
            gm->set_instance(gm, inst);
            gm->objective_and_gradients(gm, &loss, w, gain);

            sum_loss += loss;
            ++t;
        }

        /* Terminate when the loss is abnormal (NaN, -Inf, +Inf). */
        if (!isfinite(loss)) {
            logging(lg, "ERROR: overflow loss\n");
            ret = CRFSUITEERR_OVERFLOW;
            sum_loss = loss;
            goto error_exit;
        }

        /* Scale the feature weights. */
        vecscale(w, decay, K);
        decay = 1.;

        /* Include the L2 norm of the feature weights in the objective. */
        /* The factor N is necessary because lambda = 2 * C / N. */
        norm2 = vecdot(w, w, K);
        sum_loss += 0.5 * lambda * norm2 * N;

        /* One epoch finished. */
        if (!calibration) {
            /* Check if the current epoch is the best so far. */
            if (sum_loss < best_sum_loss) {
                /* Store the feature weights to best_w. */
                best_sum_loss = sum_loss;
                veccopy(best_w, w, K);
            }

            /* The stopping criterion is tested only after the first `period` epochs. */
            if (period < epoch) {
                improvement = (pf[(epoch-1) % period] - sum_loss) / sum_loss;
            } else {
                improvement = epsilon;
            }

            /* Store the current value of the objective function. */
            pf[(epoch-1) % period] = sum_loss;

            logging(lg, "Loss: %f\n", sum_loss);
            if (period < epoch) {
                logging(lg, "Improvement ratio: %f\n", improvement);
            }
            logging(lg, "Feature L2-norm: %f\n", sqrt(norm2));
            logging(lg, "Learning rate (eta): %f\n", eta);
            logging(lg, "Total number of feature updates: %.0f\n", t);
            logging(lg, "Seconds required for this iteration: %.3f\n", (clock() - clk_prev) / (double)CLOCKS_PER_SEC);

            /* Holdout evaluation if necessary. */
            if (testset != NULL) {
                holdout_evaluation(gm, testset, w, lg);
            }

            logging(lg, "\n");

            /* Check the stopping criterion. */
            if (improvement < epsilon) {
                ret = 0;
                break;
            }
        }
    }

    /* Output the optimization result. */
    if (!calibration) {
        if (ret == 0) {
            if (epoch < num_epochs) {
                logging(lg, "SGD terminated with the stopping criterion\n");
            } else {
                logging(lg, "SGD terminated with the maximum number of iterations\n");
            }
        } else {
            logging(lg, "SGD terminated with error code (%d)\n", ret);
        }
    }

    /* Restore the best weights. */
    if (best_w != NULL) {
        sum_loss = best_sum_loss;
        veccopy(w, best_w, K);
    }

error_exit:
    free(best_w);
    free(pf);
    if (ptr_loss != NULL) {
        *ptr_loss = sum_loss;
    }

    return ret;
}

static floatval_t l2sgd_calibration(
    encoder_t *gm,
    dataset_t *ds,
    floatval_t *w,
    logging_t *lg,
    const training_option_t* opt
    )
{
    int i;
    int dec = 0, ok, trials = 1;
    int num = opt->calibration_candidates;
    clock_t clk_begin = clock();
    floatval_t loss = 0.;
    floatval_t init_loss = 0.;
    floatval_t best_loss = DBL_MAX;
    floatval_t eta = opt->calibration_eta;
    floatval_t best_eta = opt->calibration_eta;
    const int N = ds->num_instances;
    const int S = MIN(N, opt->calibration_samples);
    const int K = gm->num_features;
    const floatval_t init_eta = opt->calibration_eta;
    const floatval_t rate = opt->calibration_rate;
    const floatval_t lambda = opt->lambda;

    logging(lg, "Calibrating the learning rate (eta)\n");
    logging(lg, "calibration.eta: %f\n", eta);
    logging(lg, "calibration.rate: %f\n", rate);
    logging(lg, "calibration.samples: %d\n", S);
    logging(lg, "calibration.candidates: %d\n", num);
    logging(lg, "calibration.max_trials: %d\n", opt->calibration_max_trials);

    /* Initialize a permutation that shuffles the instances. */
    dataset_shuffle(ds);

    /* Initialize the feature weights as zero. */
    vecset(w, 0, K);

    /* Compute the initial loss. */
    gm->set_weights(gm, w, 1.);
    init_loss = 0;
    for (i = 0;i < S;++i) {
        floatval_t score;
        const crfsuite_instance_t *inst = dataset_get(ds, i);
        gm->set_instance(gm, inst);
        gm->score(gm, inst->labels, &score);
        init_loss -= score;
        gm->partition_factor(gm, &score);
        init_loss += score;
    }
    init_loss += 0.5 * lambda * vecdot(w, w, K) * N;
    logging(lg, "Initial loss: %f\n", init_loss);

    while (num > 0 || !dec) {
        logging(lg, "Trial #%d (eta = %f): ", trials, eta);

        /* Perform SGD for one epoch. */
        l2sgd(
            gm, ds, NULL, w, lg,
            S, 1.0 / (lambda * eta), lambda, 1, 1, 1, 0., &loss);

        /* Make sure that the learning rate decreases the loss. */
        ok = isfinite(loss) && (loss < init_loss);
        if (ok) {
            logging(lg, "%f\n", loss);
            --num;
        } else {
            logging(lg, "%f (worse)\n", loss);
        }

        if (isfinite(loss) && loss < best_loss) {
            best_loss = loss;
            best_eta = eta;
        }

        if (!dec) {
            if (ok && 0 < num) {
                eta *= rate;
            } else {
                dec = 1;
                num = opt->calibration_candidates;
                eta = init_eta / rate;
            }
        } else {
            eta /= rate;
        }

        ++trials;
        if (opt->calibration_max_trials <= trials) {
            break;
        }
    }

    eta = best_eta;
    logging(lg, "Best learning rate (eta): %f\n", eta);
    logging(lg, "Seconds required: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
    logging(lg, "\n");

    return 1.0 / (lambda * eta);
}

int exchange_options(crfsuite_params_t* params, training_option_t* opt, int mode)
{
    BEGIN_PARAM_MAP(params, mode)
        DDX_PARAM_FLOAT(
            "c2", opt->c2, 1.,
            "Coefficient for L2 regularization."
            )
        DDX_PARAM_INT(
            "max_iterations", opt->max_iterations, 1000,
            "The maximum number of iterations (epochs) for SGD optimization."
            )
        DDX_PARAM_INT(
            "period", opt->period, 10,
            "The duration of iterations to test the stopping criterion."
            )
        DDX_PARAM_FLOAT(
            "delta", opt->delta, 1e-6,
            "The threshold for the stopping criterion; an optimization process stops when\n"
            "the improvement of the log likelihood over the last ${period} iterations is no\n"
            "greater than this threshold."
            )
        DDX_PARAM_FLOAT(
            "calibration.eta", opt->calibration_eta, 0.1,
            "The initial value of learning rate (eta) used for calibration."
            )
        DDX_PARAM_FLOAT(
            "calibration.rate", opt->calibration_rate, 2.,
            "The rate of increase/decrease of learning rate for calibration."
            )
        DDX_PARAM_INT(
            "calibration.samples", opt->calibration_samples, 1000,
            "The number of instances used for calibration."
            )
        DDX_PARAM_INT(
            "calibration.candidates", opt->calibration_candidates, 10,
            "The number of candidates of learning rate."
            )
        DDX_PARAM_INT(
            "calibration.max_trials", opt->calibration_max_trials, 20,
            "The maximum number of trials of learning rates for calibration."
            )
    END_PARAM_MAP()

    return 0;
}

void crfsuite_train_l2sgd_init(crfsuite_params_t* params)
{
    exchange_options(params, NULL, 0);
}

int crfsuite_train_l2sgd(
    encoder_t *gm,
    dataset_t *trainset,
    dataset_t *testset,
    crfsuite_params_t *params,
    logging_t *lg,
    floatval_t **ptr_w
    )
{
    int ret = 0;
    floatval_t *w = NULL;
    clock_t clk_begin;
    floatval_t loss = 0;
    const int N = trainset->num_instances;
    const int K = gm->num_features;
    const int T = gm->cap_items;
    training_option_t opt;

    /* Obtain parameter values. */
    exchange_options(params, &opt, -1);

    /* Allocate arrays. */
    w = (floatval_t*)calloc(K, sizeof(floatval_t));
    if (w == NULL) {
        ret = CRFSUITEERR_OUTOFMEMORY;
        goto error_exit;
    }

    opt.lambda = 2. * opt.c2 / N;

    logging(lg, "Stochastic Gradient Descent (SGD)\n");
    logging(lg, "c2: %f\n", opt.c2);
    logging(lg, "max_iterations: %d\n", opt.max_iterations);
    logging(lg, "period: %d\n", opt.period);
    logging(lg, "delta: %f\n", opt.delta);
    logging(lg, "\n");
    clk_begin = clock();

    /* Calibrate the learning rate (eta). */
    opt.t0 = l2sgd_calibration(gm, trainset, w, lg, &opt);

    /* Perform stochastic gradient descent. */
    ret = l2sgd(
        gm, trainset, testset, w, lg,
        N, opt.t0, opt.lambda, opt.max_iterations, 0, opt.period, opt.delta, &loss
        );

    logging(lg, "Loss: %f\n", loss);
    logging(lg, "Total seconds required for training: %.3f\n", (clock() - clk_begin) / (double)CLOCKS_PER_SEC);
    logging(lg, "\n");

    *ptr_w = w;
    return ret;

error_exit:
    free(w);
    return ret;
}
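/*
    A minimal sketch, not used by the trainer, of the ${period}/${delta}
    stopping criterion that l2sgd() applies at the end of each epoch. The
    losses of the last `period` epochs are kept in a circular buffer `pf`;
    once more than `period` epochs have finished, training stops when the
    relative improvement over the loss recorded `period` epochs earlier
    drops below `delta`. The function name and signature are hypothetical
    and exist only for this sketch.
*/
static int l2sgd_should_stop_sketch(
    floatval_t *pf,         /* circular buffer of the last `period` losses */
    int period,
    int epoch,              /* 1-based epoch counter, as in l2sgd()        */
    floatval_t sum_loss,    /* regularized loss of the epoch just finished */
    floatval_t delta        /* threshold (the ${delta} parameter)          */
    )
{
    /* During the first `period` epochs the criterion is never triggered. */
    floatval_t improvement = delta;

    if (period < epoch) {
        /* Relative improvement with respect to the loss `period` epochs ago. */
        improvement = (pf[(epoch-1) % period] - sum_loss) / sum_loss;
    }

    /* Record the current loss for the epochs `period` steps ahead. */
    pf[(epoch-1) % period] = sum_loss;

    return improvement < delta;
}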