lib/xgb.rb in xgb-0.1.0 vs lib/xgb.rb in xgb-0.1.1

- old
+ new

@@ -6,21 +6,159 @@
 require "xgb/booster"
 require "xgb/dmatrix"
 require "xgb/ffi"
 require "xgb/version"
 
+# scikit-learn API
+require "xgb/classifier"
+require "xgb/regressor"
+
 module Xgb
   class Error < StandardError; end
 
   class << self
-    def train(params, dtrain, num_boost_round: 10)
+    def train(params, dtrain, num_boost_round: 10, evals: nil, early_stopping_rounds: nil, verbose_eval: true)
       booster = Booster.new(params: params)
-      booster.set_param("num_feature", dtrain.num_col)
+      num_feature = dtrain.num_col
+      booster.set_param("num_feature", num_feature)
+      booster.feature_names = num_feature.times.map { |i| "f#{i}" }
+      evals ||= []
 
+      if early_stopping_rounds
+        best_score = nil
+        best_iter = nil
+        best_message = nil
+      end
+
       num_boost_round.times do |iteration|
         booster.update(dtrain, iteration)
+
+        if evals.any?
+          message = booster.eval_set(evals, iteration)
+          res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
+
+          if early_stopping_rounds && iteration == 0
+            metric = res[-1][0]
+            puts "Will train until #{metric} hasn't improved in #{early_stopping_rounds.to_i} rounds." if verbose_eval
+          end
+
+          puts message if verbose_eval
+          score = res[-1][1]
+
+          # TODO handle larger better
+          if best_score.nil? || score < best_score
+            best_score = score
+            best_iter = iteration
+            best_message = message
+          elsif iteration - best_iter >= early_stopping_rounds
+            booster.best_iteration = best_iter
+            puts "Stopping. Best iteration:\n#{best_message}" if verbose_eval
+            break
+          end
+        end
       end
 
       booster
+    end
+
+    def cv(params, dtrain, num_boost_round: 10, nfold: 3, seed: 0, shuffle: true, verbose_eval: nil, show_stdv: true, early_stopping_rounds: nil)
+      rand_idx = (0...dtrain.num_row).to_a
+      rand_idx.shuffle!(random: Random.new(seed)) if shuffle
+
+      kstep = (rand_idx.size / nfold.to_f).ceil
+      test_id = rand_idx.each_slice(kstep).to_a[0...nfold]
+      train_id = []
+      nfold.times do |i|
+        idx = test_id.dup
+        idx.delete_at(i)
+        train_id << idx.flatten
+      end
+
+      folds = train_id.zip(test_id)
+      cvfolds = []
+      folds.each do |(train_idx, test_idx)|
+        fold_dtrain = dtrain.slice(train_idx)
+        fold_dvalid = dtrain.slice(test_idx)
+        booster = Booster.new(params: params)
+        booster.set_param("num_feature", dtrain.num_col)
+        cvfolds << [booster, fold_dtrain, fold_dvalid]
+      end
+
+      eval_hist = {}
+
+      if early_stopping_rounds
+        best_score = nil
+        best_iter = nil
+      end
+
+      num_boost_round.times do |iteration|
+        scores = {}
+
+        cvfolds.each do |(booster, fold_dtrain, fold_dvalid)|
+          booster.update(fold_dtrain, iteration)
+          message = booster.eval_set([[fold_dtrain, "train"], [fold_dvalid, "test"]], iteration)
+
+          res = message.split.map { |x| x.split(":") }[1..-1].map { |k, v| [k, v.to_f] }
+          res.each do |k, v|
+            (scores[k] ||= []) << v
+          end
+        end
+
+        message_parts = ["[#{iteration}]"]
+
+        last_mean = nil
+        means = {}
+        scores.each do |eval_name, vals|
+          mean = mean(vals)
+          stdev = stdev(vals)
+
+          (eval_hist["#{eval_name}-mean"] ||= []) << mean
+          (eval_hist["#{eval_name}-std"] ||= []) << stdev
+
+          means[eval_name] = mean
+          last_mean = mean
+
+          if show_stdv
+            message_parts << "%s:%g+%g" % [eval_name, mean, stdev]
+          else
+            message_parts << "%s:%g" % [eval_name, mean]
+          end
+        end
+
+        if early_stopping_rounds
+          score = last_mean
+          # TODO handle larger better
+          if best_score.nil? || score < best_score
+            best_score = score
+            best_iter = iteration
+          elsif iteration - best_iter >= early_stopping_rounds
+            eval_hist.each_key do |k|
+              eval_hist[k] = eval_hist[k][0..best_iter]
+            end
+            break
+          end
+        end
+
+        # put at end to keep output consistent with Python
+        puts message_parts.join("\t") if verbose_eval
+      end
+
+      eval_hist
+    end
+
+    private
+
+    def mean(arr)
+      arr.sum / arr.size.to_f
+    end
+
+    # don't subtract one from arr.size
+    def stdev(arr)
+      m = mean(arr)
+      sum = 0
+      arr.each do |v|
+        sum += (v - m) ** 2
+      end
+      Math.sqrt(sum / arr.size)
     end
   end
 end
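The headline change to train is support for evaluation sets and early stopping: evals takes [DMatrix, name] pairs (the same shape cv builds internally), and training breaks out once the last listed metric hasn't improved for early_stopping_rounds rounds, recording the winning round in booster.best_iteration. Per the TODO in the diff, only smaller-is-better metrics are handled. A minimal sketch of the new call; x_train/y_train and x_valid/y_valid are hypothetical arrays of numeric feature rows and labels, and the label: keyword on the DMatrix constructor is assumed from the released gem rather than shown in this diff:

require "xgb"

# hypothetical pre-split data
dtrain = Xgb::DMatrix.new(x_train, label: y_train)
dvalid = Xgb::DMatrix.new(x_valid, label: y_valid)

booster = Xgb.train(
  { "objective" => "reg:linear" },
  dtrain,
  num_boost_round: 100,
  evals: [[dtrain, "train"], [dvalid, "valid"]],
  early_stopping_rounds: 5  # stop 5 rounds after the best "valid" score
)

Because the last entry in evals drives the stopping decision, the validation set should come after the training set in the list.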
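The new cv method shuffles the row indices with a seeded Random, slices the DMatrix into nfold train/test splits, trains one booster per fold, and returns the per-round history as a hash keyed "<name>-<metric>-mean" and "<name>-<metric>-std", where the names "train" and "test" are hard-coded in the eval_set call above. A sketch reusing the hypothetical dtrain from the previous example, and assuming rmse as the objective's default metric:

res = Xgb.cv(
  { "objective" => "reg:linear" },
  dtrain,
  num_boost_round: 20,
  nfold: 3,
  seed: 123,
  early_stopping_rounds: 5,
  verbose_eval: true
)

res["test-rmse-mean"].last  # mean test metric at the last kept round
res["test-rmse-std"].last   # spread of that metric across the 3 folds

Note the "don't subtract one from arr.size" comment on stdev: it computes the population standard deviation, matching what the Python package reports, rather than the sample standard deviation.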
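The two new requires pull in a scikit-learn-style API. This diff shows only the require lines, so the following usage is an assumption based on the class names and the scikit-learn fit/predict convention, not code visible here:

model = Xgb::Regressor.new
model.fit(x_train, y_train)   # x_train/y_train as in the train example above
model.predict(x_valid)

Xgb::Classifier is presumably used the same way for classification targets.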