lib/xgb.rb in xgb-0.1.0 vs lib/xgb.rb in xgb-0.1.1
- old
+ new
@@ -6,21 +6,159 @@
require "xgb/booster"
require "xgb/dmatrix"
require "xgb/ffi"
require "xgb/version"
+# scikit-learn API
+require "xgb/classifier"
+require "xgb/regressor"
+
module Xgb
class Error < StandardError; end
class << self
- def train(params, dtrain, num_boost_round: 10)
+ def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true)
booster = Booster.new(params: params)
- booster.set_param("num_feature", dtrain.num_col)
+ num_feature = dtrain.num_col
+ booster.set_param("num_feature", num_feature)
+ booster.feature_names = num_feature.times.map { |i| "f#{i}" }
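+      # each evals entry is a [DMatrix, name] pair, e.g. [dtest, "eval"]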
+ evals ||= []
+ if early_stopping_rounds
+ best_score = nil
+ best_iter = nil
+ best_message = nil
+ end
+
num_boost_round.times do |iteration|
booster.update(dtrain, iteration)
+
+ if evals.any?
+ message = booster.eval_set(evals, iteration)
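+          # message looks like "[0]\ttrain-rmse:0.28\teval-rmse:0.42" (values illustrative);
+          # drop the "[0]" token, then split each "name:value" pair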
+ res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
+
+ if early_stopping_rounds && iteration == 0
+ metric = res[-1][0]
+ puts "Will train until #{metric} hasn't improved in #{early_stopping_rounds.to_i} rounds." if verbose_eval
+ end
+
+ puts message if verbose_eval
+
+          if early_stopping_rounds
+            score = res[-1][1]
+
+            # TODO handle metrics where larger is better (e.g. auc)
+            if best_score.nil? || score < best_score
+              best_score = score
+              best_iter = iteration
+              best_message = message
+            elsif iteration - best_iter >= early_stopping_rounds
+              booster.best_iteration = best_iter
+              puts "Stopping. Best iteration:\n#{best_message}" if verbose_eval
+              break
+            end
+          end
+ end
end
booster
+ end
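+
+    # usage sketch for train with early stopping (dtest is illustrative):
+    #   booster = Xgb.train(params, dtrain, evals: [[dtrain, "train"], [dtest, "eval"]], early_stopping_rounds: 5)
+    #   booster.best_iteration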
+
+ def cv(params, dtrain, num_boost_round: 10, nfold: 3, seed: 0, shuffle: true, verbose_eval: nil, show_stdv: true, early_stopping_rounds: nil)
+ rand_idx = (0...dtrain.num_row).to_a
+ rand_idx.shuffle!(random: Random.new(seed)) if shuffle
+
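+      # split the shuffled row indices into nfold slices; fold i uses slice i
+      # for testing and the remaining slices for training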
+ kstep = (rand_idx.size / nfold.to_f).ceil
+ test_id = rand_idx.each_slice(kstep).to_a[0...nfold]
+ train_id = []
+ nfold.times do |i|
+ idx = test_id.dup
+ idx.delete_at(i)
+ train_id << idx.flatten
+ end
+
+ folds = train_id.zip(test_id)
+ cvfolds = []
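+      # build one booster per fold, paired with that fold's train and test matrices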
+ folds.each do |(train_idx, test_idx)|
+ fold_dtrain = dtrain.slice(train_idx)
+ fold_dvalid = dtrain.slice(test_idx)
+ booster = Booster.new(params: params)
+ booster.set_param("num_feature", dtrain.num_col)
+ cvfolds << [booster, fold_dtrain, fold_dvalid]
+ end
+
+ eval_hist = {}
+
+ if early_stopping_rounds
+ best_score = nil
+ best_iter = nil
+ end
+
+ num_boost_round.times do |iteration|
+ scores = {}
+
+ cvfolds.each do |(booster, fold_dtrain, fold_dvalid)|
+ booster.update(fold_dtrain, iteration)
+ message = booster.eval_set([[fold_dtrain, "train"], [fold_dvalid, "test"]], iteration)
+
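+          # collect each metric's value so it can be averaged across folds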
+ res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
+ res.each do |k, v|
+ (scores[k] ||= []) << v
+ end
+ end
+
+ message_parts = ["[#{iteration}]"]
+
+ last_mean = nil
+ scores.each do |eval_name, vals|
+ mean = mean(vals)
+ stdev = stdev(vals)
+
+ (eval_hist["#{eval_name}-mean"] ||= []) << mean
+ (eval_hist["#{eval_name}-std"] ||= []) << stdev
+
+          last_mean = mean
+
+ if show_stdv
+ message_parts << "%s:%g+%g" % [eval_name, mean, stdev]
+ else
+ message_parts << "%s:%g" % [eval_name, mean]
+ end
+ end
+
+ if early_stopping_rounds
+ score = last_mean
+            # TODO handle metrics where larger is better (e.g. auc)
+ if best_score.nil? || score < best_score
+ best_score = score
+ best_iter = iteration
+ elsif iteration - best_iter >= early_stopping_rounds
+ eval_hist.each_key do |k|
+ eval_hist[k] = eval_hist[k][0..best_iter]
+ end
+ break
+ end
+ end
+
+ # put at end to keep output consistent with Python
+ puts message_parts.join("\t") if verbose_eval
+ end
+
+ eval_hist
+ end
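+
+    # usage sketch for cv (key shown assumes an rmse eval metric):
+    #   history = Xgb.cv(params, dtrain, nfold: 3, early_stopping_rounds: 5)
+    #   history["test-rmse-mean"].last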
+
+ private
+
+ def mean(arr)
+ arr.sum / arr.size.to_f
+ end
+
+      # population standard deviation (don't subtract one from arr.size),
+      # matching the output of the Python package
+ def stdev(arr)
+ m = mean(arr)
+ sum = 0
+ arr.each do |v|
+ sum += (v - m) ** 2
+ end
+ Math.sqrt(sum / arr.size)
end
end
end