lib/xgboost.rb in xgb-0.8.0 vs lib/xgboost.rb in xgb-0.9.0
- old
+ new
@@ -2,13 +2,21 @@
require "ffi"
# modules
require_relative "xgboost/utils"
require_relative "xgboost/booster"
+require_relative "xgboost/callback_container"
+require_relative "xgboost/cv_pack"
require_relative "xgboost/dmatrix"
+require_relative "xgboost/packed_booster"
require_relative "xgboost/version"
+# callbacks
+require_relative "xgboost/training_callback"
+require_relative "xgboost/early_stopping"
+require_relative "xgboost/evaluation_monitor"
+
# scikit-learn API
require_relative "xgboost/model"
require_relative "xgboost/classifier"
require_relative "xgboost/ranker"
require_relative "xgboost/regressor"
@@ -42,138 +50,117 @@
# friendlier error message
autoload :FFI, "xgboost/ffi"
class << self
# train, as changed between 0.8.0 and 0.9.0. The added ("+") side builds a
# Booster, runs up to num_boost_round update iterations on dtrain and returns
# the booster. Evaluation printing and early stopping are no longer inlined in
# the loop (see the removed "-" lines); they are appended as EvaluationMonitor
# and EarlyStopping callbacks and driven through a CallbackContainer.
#
# Keyword arguments on the added side:
#   num_boost_round       - upper bound on iterations; either iteration hook can end the loop earlier
#   evals                 - nil is treated as []; the first entry (d[0]) of every element is added to the Booster cache
#   maximize              - forwarded to EarlyStopping.new(maximize:)
#   early_stopping_rounds - when truthy, an EarlyStopping callback is appended
#   evals_result          - when non-nil, cb_container.history is merge!-ed into it (the caller's object is mutated)
#   verbose_eval          - true maps to a period of 1; any other truthy value is used as the period itself
#   callbacks             - optional list; it is dup'ed before the built-in callbacks are appended
#
# maximize, evals_result and callbacks do not exist in the removed 0.8.0 signature.
- def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true)
- booster = Booster.new(params: params)
- num_feature = dtrain.num_col
- booster.set_param("num_feature", num_feature)
- booster.feature_names = dtrain.feature_names
- booster.feature_types = dtrain.feature_types
+ def train(
+ params,
+ dtrain,
+ num_boost_round: 10,
+ evals: nil,
+ maximize: nil,
+ early_stopping_rounds: nil,
+ evals_result: nil,
+ verbose_eval: true,
+ callbacks: nil
+ )
# Work on a copy so the callbacks appended below never end up in the caller's array.
+ callbacks = callbacks.nil? ? [] : callbacks.dup
evals ||= []
+ bst = Booster.new(params: params, cache: [dtrain] + evals.map { |d| d[0] })
+
# NOTE(review): 0 is truthy in Ruby, so verbose_eval: 0 still installs a monitor
# with period 0 - confirm EvaluationMonitor copes with that value.
+ if verbose_eval
+ verbose_eval = verbose_eval == true ? 1 : verbose_eval
+ callbacks << EvaluationMonitor.new(period: verbose_eval)
+ end
if early_stopping_rounds
- best_score = nil
- best_iter = nil
- best_message = nil
+ callbacks << EarlyStopping.new(rounds: early_stopping_rounds, maximize: maximize)
end
+ cb_container = CallbackContainer.new(callbacks)
- num_boost_round.times do |iteration|
- booster.update(dtrain, iteration)
# Whatever before_training returns replaces bst for the rest of the method.
+ bst = cb_container.before_training(bst)
- if evals.any?
- message = booster.eval_set(evals, iteration)
- res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
# A truthy return from either iteration hook ends training early.
+ num_boost_round.times do |i|
+ break if cb_container.before_iteration(bst, i, dtrain, evals)
+ bst.update(dtrain, i)
+ break if cb_container.after_iteration(bst, i, dtrain, evals)
+ end
- if early_stopping_rounds && iteration == 0
- metric = res[-1][0]
- puts "Will train until #{metric} hasn't improved in #{early_stopping_rounds.to_i} rounds." if verbose_eval
- end
# As with before_training, the return value of after_training becomes the result.
+ bst = cb_container.after_training(bst)
- puts message if verbose_eval
- score = res[-1][1]
-
- # TODO handle larger better
- if best_score.nil? || score < best_score
- best_score = score
- best_iter = iteration
- best_message = message
- elsif early_stopping_rounds && iteration - best_iter >= early_stopping_rounds
- booster.best_iteration = best_iter
- puts "Stopping. Best iteration:\n#{best_message}" if verbose_eval
- break
- end
- end
+ if !evals_result.nil?
+ evals_result.merge!(cb_container.history)
end
- booster
+ bst
end
# cv, as changed between 0.8.0 and 0.9.0. The added ("+") side delegates fold
# construction to mknfold, wraps the folds in a PackedBooster, and runs up to
# num_boost_round rounds under a CallbackContainer created with is_cv: true.
# After each round it reads callbacks_container.aggregated_cv and appends every
# (key, mean, std) triple to results["<key>-mean"] and results["<key>-std"].
# Returns the results Hash.
#
# Keyword arguments on the added side:
#   nfold, seed, shuffle  - forwarded to mknfold
#   maximize              - forwarded to EarlyStopping.new(maximize:)
#   early_stopping_rounds - when truthy, an EarlyStopping callback is appended
#   verbose_eval          - true maps to a period of 1; any other truthy value is used as the period itself
#   show_stdv             - forwarded to EvaluationMonitor.new(show_stdv:)
#   callbacks             - optional list; it is dup'ed before the built-in callbacks are appended
#
# maximize and callbacks do not exist in the removed 0.8.0 signature.
- def cv(params, dtrain, num_boost_round: 10, nfold: 3, seed: 0, shuffle: true, verbose_eval: nil, show_stdv: true, early_stopping_rounds: nil)
- rand_idx = (0...dtrain.num_row).to_a
- rand_idx.shuffle!(random: Random.new(seed)) if shuffle
+ def cv(
+ params,
+ dtrain,
+ num_boost_round: 10,
+ nfold: 3,
+ maximize: nil,
+ early_stopping_rounds: nil,
+ verbose_eval: nil,
+ show_stdv: true,
+ seed: 0,
+ callbacks: nil,
+ shuffle: true
+ )
+ results = {}
+ cvfolds =
+ mknfold(
+ dall: dtrain,
+ param: params,
+ nfold: nfold,
+ seed: seed,
+ shuffle: shuffle
+ )
- kstep = (rand_idx.size / nfold.to_f).ceil
- test_id = rand_idx.each_slice(kstep).to_a[0...nfold]
- train_id = []
- nfold.times do |i|
- idx = test_id.dup
- idx.delete_at(i)
- train_id << idx.flatten
- end
# Work on a copy so the callbacks appended below never end up in the caller's array.
+ callbacks = callbacks.nil? ? [] : callbacks.dup
- folds = train_id.zip(test_id)
- cvfolds = []
- folds.each do |(train_idx, test_idx)|
- fold_dtrain = dtrain.slice(train_idx)
- fold_dvalid = dtrain.slice(test_idx)
- booster = Booster.new(params: params)
- booster.set_param("num_feature", dtrain.num_col)
- cvfolds << [booster, fold_dtrain, fold_dvalid]
+ if verbose_eval
+ verbose_eval = verbose_eval == true ? 1 : verbose_eval
+ callbacks << EvaluationMonitor.new(period: verbose_eval, show_stdv: show_stdv)
end
-
- eval_hist = {}
-
if early_stopping_rounds
- best_score = nil
- best_iter = nil
+ callbacks << EarlyStopping.new(rounds: early_stopping_rounds, maximize: maximize)
end
+ callbacks_container = CallbackContainer.new(callbacks, is_cv: true)
- num_boost_round.times do |iteration|
- scores = {}
# NOTE(review): the return values of before_training / after_training are
# discarded in cv, whereas train rebinds its booster to them - confirm that
# this difference is intentional.
+ booster = PackedBooster.new(cvfolds)
+ callbacks_container.before_training(booster)
- cvfolds.each do |(booster, fold_dtrain, fold_dvalid)|
- booster.update(fold_dtrain, iteration)
- message = booster.eval_set([[fold_dtrain, "train"], [fold_dvalid, "test"]], iteration)
# The hooks receive nil in the evals position; a truthy before_iteration ends the loop.
+ num_boost_round.times do |i|
+ break if callbacks_container.before_iteration(booster, i, dtrain, nil)
+ booster.update(i)
- res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
- res.each do |k, v|
- (scores[k] ||= []) << v
# The stop decision is kept in should_break so this round's aggregates are
# still recorded before the loop is left.
+ should_break = callbacks_container.after_iteration(booster, i, dtrain, nil)
+ res = callbacks_container.aggregated_cv
+ res.each do |key, mean, std|
+ if !results.include?(key + "-mean")
+ results[key + "-mean"] = []
end
- end
-
- message_parts = ["[#{iteration}]"]
-
- last_mean = nil
- means = {}
- scores.each do |eval_name, vals|
- mean = mean(vals)
- stdev = stdev(vals)
-
- (eval_hist["#{eval_name}-mean"] ||= []) << mean
- (eval_hist["#{eval_name}-std"] ||= []) << stdev
-
- means[eval_name] = mean
- last_mean = mean
-
- if show_stdv
- message_parts << "%s:%g+%g" % [eval_name, mean, stdev]
- else
- message_parts << "%s:%g" % [eval_name, mean]
+ if !results.include?(key + "-std")
+ results[key + "-std"] = []
end
+ results[key + "-mean"] << mean
+ results[key + "-std"] << std
end
- if early_stopping_rounds
- score = last_mean
- # TODO handle larger better
- if best_score.nil? || score < best_score
- best_score = score
- best_iter = iteration
- elsif iteration - best_iter >= early_stopping_rounds
- eval_hist.each_key do |k|
- eval_hist[k] = eval_hist[k][0..best_iter]
- end
- break
# On early stop every series is cut to indices 0..booster.best_iteration (inclusive).
# NOTE(review): the beginless range literal needs Ruby 2.7+; the removed code
# used [0..best_iter] - confirm against the gem's required Ruby version.
+ if should_break
+ results.keys.each do |k|
+ results[k] = results[k][..booster.best_iteration]
end
+ break
end
-
- # put at end to keep output consistent with Python
- puts message_parts.join("\t") if verbose_eval
end
- eval_hist
# Runs after the loop whether it finished normally or was left via break.
+ callbacks_container.after_training(booster)
+
+ results
end
def lib_version
major = ::FFI::MemoryPointer.new(:int)
minor = ::FFI::MemoryPointer.new(:int)
@@ -182,20 +169,28 @@
"#{major.read_int}.#{minor.read_int}.#{patch.read_int}"
end
private
# This hunk removes the private mean / stdev helpers and adds the private
# mknfold helper in their place.
#
# mknfold builds the cross-validation folds:
#   1. rand_idx is 0...dall.num_row, shuffled with Random.new(seed) when shuffle is truthy.
#   2. kstep = ceil(row count / nfold) is the chunk size; out_idset holds at most
#      the first nfold chunks and serves as the per-fold validation rows.
#   3. in_idset[i] is every chunk except chunk i, flattened (the training rows).
#   4. Returns an Array with one CVPack.new(train_slice, valid_slice, param) per fold.
#
# NOTE(review): each_slice(kstep) can yield fewer than nfold chunks (5 rows with
# nfold 4 gives kstep 2 and only 3 chunks). For the missing folds out_idset[k]
# is nil, and delete_at with an out-of-range index removes nothing, so
# in_idset[k] contains every row. What dall.slice(nil) does is not visible here -
# confirm, or validate nfold against the row count.
# NOTE(review): an empty dall makes kstep 0, and each_slice(0) raises ArgumentError.
- def mean(arr)
- arr.sum / arr.size.to_f
- end
+ def mknfold(dall:, param:, nfold:, seed:, shuffle:)
+ rand_idx = (0...dall.num_row).to_a
+ rand_idx.shuffle!(random: Random.new(seed)) if shuffle
- # don't subtract one from arr.size
- def stdev(arr)
- m = mean(arr)
- sum = 0
- arr.each do |v|
- sum += (v - m) ** 2
+ kstep = (rand_idx.size / nfold.to_f).ceil
+ out_idset = rand_idx.each_slice(kstep).to_a[0...nfold]
+ in_idset = []
+ nfold.times do |i|
+ idx = out_idset.dup
+ idx.delete_at(i)
+ in_idset << idx.flatten
end
- Math.sqrt(sum / arr.size)
+
+ ret = []
+ nfold.times do |k|
+ fold_dtrain = dall.slice(in_idset[k])
+ fold_dvalid = dall.slice(out_idset[k])
+ ret << CVPack.new(fold_dtrain, fold_dvalid, param)
+ end
+ ret
end
end
end