lib/xgboost.rb in xgb-0.8.0 vs lib/xgboost.rb in xgb-0.9.0

- old (xgb-0.8.0)
+ new (xgb-0.9.0)

@@ -2,13 +2,21 @@
require "ffi"

# modules
require_relative "xgboost/utils"
require_relative "xgboost/booster"
+require_relative "xgboost/callback_container"
+require_relative "xgboost/cv_pack"
require_relative "xgboost/dmatrix"
+require_relative "xgboost/packed_booster"
require_relative "xgboost/version"

+# callbacks
+require_relative "xgboost/training_callback"
+require_relative "xgboost/early_stopping"
+require_relative "xgboost/evaluation_monitor"
+
# scikit-learn API
require_relative "xgboost/model"
require_relative "xgboost/classifier"
require_relative "xgboost/ranker"
require_relative "xgboost/regressor"
@@ -42,138 +50,117 @@
  # friendlier error message
  autoload :FFI, "xgboost/ffi"

  class << self
-    def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true)
-      booster = Booster.new(params: params)
-      num_feature = dtrain.num_col
-      booster.set_param("num_feature", num_feature)
-      booster.feature_names = dtrain.feature_names
-      booster.feature_types = dtrain.feature_types
+    def train(
+      params,
+      dtrain,
+      num_boost_round: 10,
+      evals: nil,
+      maximize: nil,
+      early_stopping_rounds: nil,
+      evals_result: nil,
+      verbose_eval: true,
+      callbacks: nil
+    )
+      callbacks = callbacks.nil? ? [] : callbacks.dup
      evals ||= []
+      bst = Booster.new(params: params, cache: [dtrain] + evals.map { |d| d[0] })
+
+      if verbose_eval
+        verbose_eval = verbose_eval == true ? 1 : verbose_eval
+        callbacks << EvaluationMonitor.new(period: verbose_eval)
+      end
      if early_stopping_rounds
-        best_score = nil
-        best_iter = nil
-        best_message = nil
+        callbacks << EarlyStopping.new(rounds: early_stopping_rounds, maximize: maximize)
      end
+      cb_container = CallbackContainer.new(callbacks)

-      num_boost_round.times do |iteration|
-        booster.update(dtrain, iteration)
+      bst = cb_container.before_training(bst)

-        if evals.any?
-          message = booster.eval_set(evals, iteration)
-          res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
+      num_boost_round.times do |i|
+        break if cb_container.before_iteration(bst, i, dtrain, evals)
+        bst.update(dtrain, i)
+        break if cb_container.after_iteration(bst, i, dtrain, evals)
+      end

-          if early_stopping_rounds && iteration == 0
-            metric = res[-1][0]
-            puts "Will train until #{metric} hasn't improved in #{early_stopping_rounds.to_i} rounds." if verbose_eval
-          end
+      bst = cb_container.after_training(bst)

-          puts message if verbose_eval
-          score = res[-1][1]
-
-          # TODO handle larger better
-          if best_score.nil? || score < best_score
-            best_score = score
-            best_iter = iteration
-            best_message = message
-          elsif early_stopping_rounds && iteration - best_iter >= early_stopping_rounds
-            booster.best_iteration = best_iter
-            puts "Stopping. Best iteration:\n#{best_message}" if verbose_eval
-            break
-          end
-        end
+      if !evals_result.nil?
+        evals_result.merge!(cb_container.history)
      end

-      booster
+      bst
    end

-    def cv(params, dtrain, num_boost_round: 10, nfold: 3, seed: 0, shuffle: true, verbose_eval: nil, show_stdv: true, early_stopping_rounds: nil)
-      rand_idx = (0...dtrain.num_row).to_a
-      rand_idx.shuffle!(random: Random.new(seed)) if shuffle
+    def cv(
+      params,
+      dtrain,
+      num_boost_round: 10,
+      nfold: 3,
+      maximize: nil,
+      early_stopping_rounds: nil,
+      verbose_eval: nil,
+      show_stdv: true,
+      seed: 0,
+      callbacks: nil,
+      shuffle: true
+    )
+      results = {}
+      cvfolds =
+        mknfold(
+          dall: dtrain,
+          param: params,
+          nfold: nfold,
+          seed: seed,
+          shuffle: shuffle
+        )

-      kstep = (rand_idx.size / nfold.to_f).ceil
-      test_id = rand_idx.each_slice(kstep).to_a[0...nfold]
-      train_id = []
-      nfold.times do |i|
-        idx = test_id.dup
-        idx.delete_at(i)
-        train_id << idx.flatten
-      end
+      callbacks = callbacks.nil? ? [] : callbacks.dup

-      folds = train_id.zip(test_id)
-      cvfolds = []
-      folds.each do |(train_idx, test_idx)|
-        fold_dtrain = dtrain.slice(train_idx)
-        fold_dvalid = dtrain.slice(test_idx)
-        booster = Booster.new(params: params)
-        booster.set_param("num_feature", dtrain.num_col)
-        cvfolds << [booster, fold_dtrain, fold_dvalid]
+      if verbose_eval
+        verbose_eval = verbose_eval == true ? 1 : verbose_eval
+        callbacks << EvaluationMonitor.new(period: verbose_eval, show_stdv: show_stdv)
      end
-
-      eval_hist = {}
-
      if early_stopping_rounds
-        best_score = nil
-        best_iter = nil
+        callbacks << EarlyStopping.new(rounds: early_stopping_rounds, maximize: maximize)
      end
+      callbacks_container = CallbackContainer.new(callbacks, is_cv: true)

-      num_boost_round.times do |iteration|
-        scores = {}
+      booster = PackedBooster.new(cvfolds)
+      callbacks_container.before_training(booster)

-        cvfolds.each do |(booster, fold_dtrain, fold_dvalid)|
-          booster.update(fold_dtrain, iteration)
-          message = booster.eval_set([[fold_dtrain, "train"], [fold_dvalid, "test"]], iteration)
+      num_boost_round.times do |i|
+        break if callbacks_container.before_iteration(booster, i, dtrain, nil)
+        booster.update(i)

-          res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
-          res.each do |k, v|
-            (scores[k] ||= []) << v
+        should_break = callbacks_container.after_iteration(booster, i, dtrain, nil)
+        res = callbacks_container.aggregated_cv
+        res.each do |key, mean, std|
+          if !results.include?(key + "-mean")
+            results[key + "-mean"] = []
          end
-        end
-
-        message_parts = ["[#{iteration}]"]
-
-        last_mean = nil
-        means = {}
-        scores.each do |eval_name, vals|
-          mean = mean(vals)
-          stdev = stdev(vals)
-
-          (eval_hist["#{eval_name}-mean"] ||= []) << mean
-          (eval_hist["#{eval_name}-std"] ||= []) << stdev
-
-          means[eval_name] = mean
-          last_mean = mean
-
-          if show_stdv
-            message_parts << "%s:%g+%g" % [eval_name, mean, stdev]
-          else
-            message_parts << "%s:%g" % [eval_name, mean]
+          if !results.include?(key + "-std")
+            results[key + "-std"] = []
          end
+          results[key + "-mean"] << mean
+          results[key + "-std"] << std
        end

-        if early_stopping_rounds
-          score = last_mean
-          # TODO handle larger better
-          if best_score.nil? || score < best_score
-            best_score = score
-            best_iter = iteration
-          elsif iteration - best_iter >= early_stopping_rounds
-            eval_hist.each_key do |k|
-              eval_hist[k] = eval_hist[k][0..best_iter]
-            end
-            break
+        if should_break
+          results.keys.each do |k|
+            results[k] = results[k][..booster.best_iteration]
          end
+          break
        end
-
-        # put at end to keep output consistent with Python
-        puts message_parts.join("\t") if verbose_eval
      end

-      eval_hist
+      callbacks_container.after_training(booster)
+
+      results
    end

    def lib_version
      major = ::FFI::MemoryPointer.new(:int)
      minor = ::FFI::MemoryPointer.new(:int)
@@ -182,20 +169,28 @@
      "#{major.read_int}.#{minor.read_int}.#{patch.read_int}"
    end

    private

-    def mean(arr)
-      arr.sum / arr.size.to_f
-    end
+    def mknfold(dall:, param:, nfold:, seed:, shuffle:)
+      rand_idx = (0...dall.num_row).to_a
+      rand_idx.shuffle!(random: Random.new(seed)) if shuffle

-    # don't subtract one from arr.size
-    def stdev(arr)
-      m = mean(arr)
-      sum = 0
-      arr.each do |v|
-        sum += (v - m) ** 2
+      kstep = (rand_idx.size / nfold.to_f).ceil
+      out_idset = rand_idx.each_slice(kstep).to_a[0...nfold]
+      in_idset = []
+      nfold.times do |i|
+        idx = out_idset.dup
+        idx.delete_at(i)
+        in_idset << idx.flatten
      end
-      Math.sqrt(sum / arr.size)
+
+      ret = []
+      nfold.times do |k|
+        fold_dtrain = dall.slice(in_idset[k])
+        fold_dvalid = dall.slice(out_idset[k])
+        ret << CVPack.new(fold_dtrain, fold_dvalid, param)
+      end
+      ret
    end
  end
end
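
Usage note: in 0.9.0 the hand-rolled logging and early-stopping logic is replaced by callbacks (EvaluationMonitor, EarlyStopping) driven by a CallbackContainer, and cv now builds its folds through the private mknfold helper, wrapping them in CVPack and PackedBooster. Below is a minimal sketch of calling the new train/cv signatures shown in the diff; the data, labels, and parameter values are illustrative assumptions, not code from the gem.

require "xgboost"

# toy regression data (assumed; any numeric 2-D array plus label array should do)
x_train = [[1, 2], [3, 4], [5, 6], [7, 8], [9, 10], [11, 12]]
y_train = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]
x_valid = [[2, 3], [6, 7]]
y_valid = [2.5, 6.5]

dtrain = XGBoost::DMatrix.new(x_train, label: y_train)
dvalid = XGBoost::DMatrix.new(x_valid, label: y_valid)

params = {objective: "reg:squarederror"}
evals_result = {}

booster = XGBoost.train(
  params,
  dtrain,
  num_boost_round: 100,
  evals: [[dtrain, "train"], [dvalid, "valid"]],
  early_stopping_rounds: 5,   # becomes an EarlyStopping callback
  verbose_eval: 10,           # becomes an EvaluationMonitor with period: 10
  evals_result: evals_result  # filled from CallbackContainer#history after training
)

# cross-validation; result keys follow the "<name>-<metric>-mean" / "-std"
# pattern built up in the cv loop above
cv_results = XGBoost.cv(
  params,
  dtrain,
  num_boost_round: 100,
  nfold: 3,
  early_stopping_rounds: 5,
  verbose_eval: 10
)

Custom callbacks can be passed through the callbacks: keyword on either method; they are wrapped in the same CallbackContainer, presumably following the TrainingCallback interface required at the top of the file.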