/**
 * LIBLINEAR interface for Numo::NArray
 */
#include "liblinearext.h"

VALUE mNumo;
VALUE mLiblinear;

/* Print callback that throws away the message it receives. */
void print_null(const char *s) { (void)s; }

/**
 * Train the model according to the given training data.
 * Inputs of another class are cast to Numo::DFloat before use.
 *
 * @overload train(x, y, param) -> Hash
 *   @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for training the model.
 *   @param y [Numo::DFloat] (shape: [n_samples]) The labels or target values for samples.
 *   @param param [Hash] The parameters of a model.
 *
 * @example
 *   require 'numo/liblinear'
 *
 *   # Prepare training dataset.
 *   x = Numo::DFloat[[-0.8, 1.0], [-0.5, 0.8], [0.9, -0.8], [0.8, -0.7]]
 *   y = Numo::Int32[-1, -1, 1, 1]
 *
 *   # Train L2-regularized L2-loss support vector classifier.
 *   param = {
 *     solver_type: Numo::Liblinear::SolverType::L2R_L2LOSS_SVC_DUAL,
 *     C: 0.1,
 *     random_seed: 1
 *   }
 *   model = Numo::Liblinear.train(x, y, param)
 *
 *   # Predict labels of test data.
 *   x_test = Numo::DFloat[[-0.7, 0.9], [0.5, -0.4]]
 *   result = Numo::Liblinear.predict(x_test, param, model)
 *   p result
 *   # Numo::DFloat#shape=[2]
 *   # [-1, 1]
 *
 * @raise [ArgumentError] If the sample array is not 2-dimensional, the label array is not 1-dimensional,
 *   the sample array and label array do not have the same number of samples, or
 *   the hyperparameter has an invalid value, this error is raised.
 * @return [Hash] The model obtained from the training procedure.
*/
static VALUE numo_liblinear_train(VALUE self, VALUE x_val, VALUE y_val, VALUE param_hash) {
  struct problem* problem;
  struct parameter* param;
  struct model* model;
  narray_t* x_nary;
  narray_t* y_nary;
  const char* err_msg;
  VALUE random_seed;
  VALUE verbose;
  VALUE model_hash;

  /* Coerce both inputs to contiguous Numo::DFloat arrays. */
  if (CLASS_OF(x_val) != numo_cDFloat) {
    x_val = rb_funcall(numo_cDFloat, rb_intern("cast"), 1, x_val);
  }
  if (CLASS_OF(y_val) != numo_cDFloat) {
    y_val = rb_funcall(numo_cDFloat, rb_intern("cast"), 1, y_val);
  }
  if (!RTEST(nary_check_contiguous(x_val))) {
    x_val = nary_dup(x_val);
  }
  if (!RTEST(nary_check_contiguous(y_val))) {
    y_val = nary_dup(y_val);
  }

  /* Validate shapes. rb_raise does not return, so no return statement follows it. */
  GetNArray(x_val, x_nary);
  GetNArray(y_val, y_nary);
  if (NA_NDIM(x_nary) != 2) {
    rb_raise(rb_eArgError, "Expect samples to be 2-D array.");
  }
  if (NA_NDIM(y_nary) != 1) {
    rb_raise(rb_eArgError, "Expect label or target values to be 1-D arrray.");
  }
  if (NA_SHAPE(x_nary)[0] != NA_SHAPE(y_nary)[0]) {
    rb_raise(rb_eArgError, "Expect to have the same number of samples for samples and labels.");
  }

  /* Seed the C library generator only when the caller asked for it. */
  random_seed = rb_hash_aref(param_hash, ID2SYM(rb_intern("random_seed")));
  if (!NIL_P(random_seed)) {
    srand(NUM2UINT(random_seed));
  }

  param = rb_hash_to_parameter(param_hash);
  problem = dataset_to_problem(x_val, y_val);

  /* Release the native structures before raising, since rb_raise unwinds past this frame. */
  err_msg = check_parameter(problem, param);
  if (err_msg) {
    xfree_problem(problem);
    xfree_parameter(param);
    rb_raise(rb_eArgError, "Invalid LIBLINEAR parameter is given: %s", err_msg);
  }

  /*
   * The print callback is set on every call, not only when silencing.
   * Otherwise a previous non-verbose call would leave print_null installed
   * and a later `verbose: true` call would stay silent. Passing NULL asks
   * LIBLINEAR for its default printer (see the LIBLINEAR README).
   */
  verbose = rb_hash_aref(param_hash, ID2SYM(rb_intern("verbose")));
  if (verbose == Qtrue) {
    set_print_string_function(NULL);
  } else {
    set_print_string_function(print_null);
  }

  model = train(problem, param);
  model_hash = model_to_rb_hash(model);
  free_and_destroy_model(&model);

  xfree_problem(problem);
  xfree_parameter(param);

  return model_hash;
}

/**
 * Perform cross validation under given parameters. The given samples are separated to n_folds folds.
 * The predicted labels or values in the validation process are returned.
* * @overload cv(x, y, param, n_folds) -> Numo::DFloat * @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to be used for training the model. * @param y [Numo::DFloat] (shape: [n_samples]) The labels or target values for samples. * @param param [Hash] The parameters of a model. * @param n_folds [Integer] The number of folds. * * @example * require 'numo/liblinear' * * # x: samples * # y: labels * * # Define parameters of L2-regularized L2-loss support vector classification. * param = { * solver_type: Numo::Liblinear::SolverType::L2R_L2LOSS_SVC_DUAL, * C: 1, * random_seed: 1, * verbose: true * } * * # Perform 5-cross validation. * n_folds = 5 * res = Numo::Liblinear::cv(x, y, param, n_folds) * * # Print mean accuracy. * mean_accuracy = y.eq(res).count.fdiv(y.size) * puts "Accuracy: %.1f %%" % (100 * mean_accuracy) * * @raise [ArgumentError] If the sample array is not 2-dimensional, the label array is not 1-dimensional, * the sample array and label array do not have the same number of samples, or * the hyperparameter has an invalid value, this error is raised. * @return [Numo::DFloat] (shape: [n_samples]) The predicted class label or value of each sample. 
*/ static VALUE numo_liblinear_cross_validation(VALUE self, VALUE x_val, VALUE y_val, VALUE param_hash, VALUE nr_folds) { const int n_folds = NUM2INT(nr_folds); size_t t_shape[1]; VALUE t_val; double* t_pt; narray_t* x_nary; narray_t* y_nary; char* err_msg; VALUE random_seed; VALUE verbose; struct problem* problem; struct parameter* param; if (CLASS_OF(x_val) != numo_cDFloat) { x_val = rb_funcall(numo_cDFloat, rb_intern("cast"), 1, x_val); } if (CLASS_OF(y_val) != numo_cDFloat) { y_val = rb_funcall(numo_cDFloat, rb_intern("cast"), 1, y_val); } if (!RTEST(nary_check_contiguous(x_val))) { x_val = nary_dup(x_val); } if (!RTEST(nary_check_contiguous(y_val))) { y_val = nary_dup(y_val); } GetNArray(x_val, x_nary); GetNArray(y_val, y_nary); if (NA_NDIM(x_nary) != 2) { rb_raise(rb_eArgError, "Expect samples to be 2-D array."); return Qnil; } if (NA_NDIM(y_nary) != 1) { rb_raise(rb_eArgError, "Expect label or target values to be 1-D arrray."); return Qnil; } if (NA_SHAPE(x_nary)[0] != NA_SHAPE(y_nary)[0]) { rb_raise(rb_eArgError, "Expect to have the same number of samples for samples and labels."); return Qnil; } random_seed = rb_hash_aref(param_hash, ID2SYM(rb_intern("random_seed"))); if (!NIL_P(random_seed)) { srand(NUM2UINT(random_seed)); } param = rb_hash_to_parameter(param_hash); problem = dataset_to_problem(x_val, y_val); err_msg = check_parameter(problem, param); if (err_msg) { xfree_problem(problem); xfree_parameter(param); rb_raise(rb_eArgError, "Invalid LIBLINEAR parameter is given: %s", err_msg); return Qnil; } t_shape[0] = problem->l; t_val = rb_narray_new(numo_cDFloat, 1, t_shape); t_pt = (double*)na_get_pointer_for_write(t_val); verbose = rb_hash_aref(param_hash, ID2SYM(rb_intern("verbose"))); if (verbose != Qtrue) { set_print_string_function(print_null); } cross_validation(problem, param, n_folds, t_pt); xfree_problem(problem); xfree_parameter(param); return t_val; } /** * Predict class labels or values for given samples. 
* * @overload predict(x, param, model) -> Numo::DFloat * @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the scores. * @param param [Hash] The parameters of the trained model. * @param model [Hash] The model obtained from the training procedure. * * @raise [ArgumentError] If the sample array is not 2-dimensional, this error is raised. * @return [Numo::DFloat] (shape: [n_samples]) The predicted class label or value of each sample. */ static VALUE numo_liblinear_predict(VALUE self, VALUE x_val, VALUE param_hash, VALUE model_hash) { struct parameter* param; struct model* model; struct feature_node* x_nodes; narray_t* x_nary; double* x_pt; size_t y_shape[1]; VALUE y_val; double* y_pt; int i, j; int n_samples; int n_features; /* Obtain C data structures. */ if (CLASS_OF(x_val) != numo_cDFloat) { x_val = rb_funcall(numo_cDFloat, rb_intern("cast"), 1, x_val); } if (!RTEST(nary_check_contiguous(x_val))) { x_val = nary_dup(x_val); } GetNArray(x_val, x_nary); if (NA_NDIM(x_nary) != 2) { rb_raise(rb_eArgError, "Expect samples to be 2-D array."); return Qnil; } param = rb_hash_to_parameter(param_hash); model = rb_hash_to_model(model_hash); model->param = *param; /* Initialize some variables. */ n_samples = (int)NA_SHAPE(x_nary)[0]; n_features = (int)NA_SHAPE(x_nary)[1]; y_shape[0] = n_samples; y_val = rb_narray_new(numo_cDFloat, 1, y_shape); y_pt = (double*)na_get_pointer_for_write(y_val); x_pt = (double*)na_get_pointer_for_read(x_val); /* Predict values. */ for (i = 0; i < n_samples; i++) { x_nodes = dbl_vec_to_node(&x_pt[i * n_features], n_features); y_pt[i] = predict(model, x_nodes); xfree(x_nodes); } xfree_model(model); xfree_parameter(param); return y_val; } /** * Calculate decision values for given samples. * * @overload decision_function(x, param, model) -> Numo::DFloat * @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to calculate the scores. * @param param [Hash] The parameters of the trained model. 
* @param model [Hash] The model obtained from the training procedure. * * @raise [ArgumentError] If the sample array is not 2-dimensional, this error is raised. * @return [Numo::DFloat] (shape: [n_samples, n_classes]) The decision value of each sample. */ static VALUE numo_liblinear_decision_function(VALUE self, VALUE x_val, VALUE param_hash, VALUE model_hash) { struct parameter* param; struct model* model; struct feature_node* x_nodes; narray_t* x_nary; double* x_pt; size_t y_shape[2]; VALUE y_val; double* y_pt; double* dec_values; int y_cols; int i, j; int n_samples; int n_features; /* Obtain C data structures. */ if (CLASS_OF(x_val) != numo_cDFloat) { x_val = rb_funcall(numo_cDFloat, rb_intern("cast"), 1, x_val); } if (!RTEST(nary_check_contiguous(x_val))) { x_val = nary_dup(x_val); } GetNArray(x_val, x_nary); if (NA_NDIM(x_nary) != 2) { rb_raise(rb_eArgError, "Expect samples to be 2-D array."); return Qnil; } param = rb_hash_to_parameter(param_hash); model = rb_hash_to_model(model_hash); model->param = *param; /* Initialize some variables. */ n_samples = (int)NA_SHAPE(x_nary)[0]; n_features = (int)NA_SHAPE(x_nary)[1]; if (model->nr_class == 2 && model->param.solver_type != MCSVM_CS) { y_shape[0] = n_samples; y_shape[1] = 1; y_val = rb_narray_new(numo_cDFloat, 1, y_shape); } else { y_shape[0] = n_samples; y_shape[1] = model->nr_class; y_val = rb_narray_new(numo_cDFloat, 2, y_shape); } x_pt = (double*)na_get_pointer_for_read(x_val); y_pt = (double*)na_get_pointer_for_write(y_val); /* Predict values. 
*/ if (model->nr_class == 2 && model->param.solver_type != MCSVM_CS) { for (i = 0; i < n_samples; i++) { x_nodes = dbl_vec_to_node(&x_pt[i * n_features], n_features); predict_values(model, x_nodes, &y_pt[i]); xfree(x_nodes); } } else { y_cols = (int)y_shape[1]; dec_values = ALLOC_N(double, y_cols); for (i = 0; i < n_samples; i++) { x_nodes = dbl_vec_to_node(&x_pt[i * n_features], n_features); predict_values(model, x_nodes, dec_values); xfree(x_nodes); for (j = 0; j < y_cols; j++) { y_pt[i * y_cols + j] = dec_values[j]; } } xfree(dec_values); } xfree_model(model); xfree_parameter(param); return y_val; } /** * Predict class probability for given samples. * The model must have probability information calcualted in training procedure. * The method supports only the logistic regression. * * @overload predict_proba(x, param, model) -> Numo::DFloat * @param x [Numo::DFloat] (shape: [n_samples, n_features]) The samples to predict the class probabilities. * @param param [Hash] The parameters of the trained Logistic Regression model. * @param model [Hash] The model obtained from the training procedure. * * @raise [ArgumentError] If the sample array is not 2-dimensional, this error is raised. * @return [Numo::DFloat] (shape: [n_samples, n_classes]) Predicted probablity of each class per sample. 
*/
static VALUE numo_liblinear_predict_proba(VALUE self, VALUE x_val, VALUE param_hash, VALUE model_hash) {
  struct parameter* param;
  struct model* model;
  struct feature_node* x_nodes;
  narray_t* x_nary;
  double* x_pt;
  size_t y_shape[2];
  VALUE y_val = Qnil;
  double* y_pt;
  int i;
  int n_samples;
  int n_features;
  int n_classes;

  /*
   * Obtain C data structures. The cast and contiguity fix-up happen before
   * GetNArray, as in predict and decision_function, so that x_nary describes
   * the same object the data pointer is later taken from, and so that inputs
   * which are only convertible to Numo::DFloat are accepted here as well.
   */
  if (CLASS_OF(x_val) != numo_cDFloat) {
    x_val = rb_funcall(numo_cDFloat, rb_intern("cast"), 1, x_val);
  }
  if (!RTEST(nary_check_contiguous(x_val))) {
    x_val = nary_dup(x_val);
  }

  /* rb_raise does not return, so no return statement follows it. */
  GetNArray(x_val, x_nary);
  if (NA_NDIM(x_nary) != 2) {
    rb_raise(rb_eArgError, "Expect samples to be 2-D array.");
  }

  param = rb_hash_to_parameter(param_hash);
  model = rb_hash_to_model(model_hash);
  model->param = *param;

  /* Only logistic regression solvers are handled; otherwise y_val stays nil. */
  if (model->param.solver_type == L2R_LR || model->param.solver_type == L1R_LR || model->param.solver_type == L2R_LR_DUAL) {
    /* Initialize some variables. */
    n_samples = (int)NA_SHAPE(x_nary)[0];
    n_features = (int)NA_SHAPE(x_nary)[1];
    n_classes = model->nr_class;
    y_shape[0] = n_samples;
    y_shape[1] = n_classes;
    y_val = rb_narray_new(numo_cDFloat, 2, y_shape);
    x_pt = (double*)na_get_pointer_for_read(x_val);
    y_pt = (double*)na_get_pointer_for_write(y_val);

    /*
     * Predict values. Each sample's probabilities are written straight into
     * its row of the output, which has room for n_classes doubles. Offsets
     * are computed in size_t to avoid int overflow on large inputs.
     */
    for (i = 0; i < n_samples; i++) {
      x_nodes = dbl_vec_to_node(&x_pt[(size_t)i * n_features], n_features);
      predict_probability(model, x_nodes, &y_pt[(size_t)i * n_classes]);
      xfree(x_nodes);
    }
  }

  xfree_model(model);
  xfree_parameter(param);

  return y_val;
}

/**
 * Load the parameters and model from a text file with LIBLINEAR format.
 *
 * @overload load_model(filename) -> Array
 *   @param filename [String] The path to a file to load.
 *
 * @raise [IOError] This error raises when failed to load the model file.
 * @return [Array] Array contains the parameters and model.
*/ static VALUE numo_liblinear_load_model(VALUE self, VALUE filename) { char* filename_ = StringValuePtr(filename); struct model* model = load_model(filename_); VALUE res = rb_ary_new2(2); VALUE param_hash = Qnil; VALUE model_hash = Qnil; if (model == NULL) { rb_raise(rb_eIOError, "Failed to load file '%s'", filename_); return Qnil; } if (model) { param_hash = parameter_to_rb_hash(&(model->param)); model_hash = model_to_rb_hash(model); free_and_destroy_model(&model); } rb_ary_store(res, 0, param_hash); rb_ary_store(res, 1, model_hash); return res; } /** * Save the parameters and model as a text file with LIBLINEAR format. The saved file can be used with the liblinear tools. * Note that the save_model saves only the parameters necessary for estimation with the trained model. * * @overload save_model(filename, param, model) -> Boolean * @param filename [String] The path to a file to save. * @param param [Hash] The parameters of the trained model. * @param model [Hash] The model obtained from the training procedure. * * @raise [IOError] This error raises when failed to save the model file. * @return [Boolean] true on success, or false if an error occurs. */ static VALUE numo_liblinear_save_model(VALUE self, VALUE filename, VALUE param_hash, VALUE model_hash) { char* filename_ = StringValuePtr(filename); struct parameter* param = rb_hash_to_parameter(param_hash); struct model* model = rb_hash_to_model(model_hash); int res; model->param = *param; res = save_model(filename_, model); xfree_model(model); xfree_parameter(param); if (res < 0) { rb_raise(rb_eIOError, "Failed to save file '%s'", filename_); return Qfalse; } return Qtrue; } void Init_liblinearext() { rb_require("numo/narray"); /** * Document-module: Numo * Numo is the top level namespace of NUmerical MOdules for Ruby. */ mNumo = rb_define_module("Numo"); /** * Document-module: Numo::Liblinear * Numo::Liblinear is a binding library for LIBLINEAR that handles dataset with Numo::NArray. 
*/ mLiblinear = rb_define_module_under(mNumo, "Liblinear"); /* The version of LIBLINEAR used in backgroud library. */ rb_define_const(mLiblinear, "LIBLINEAR_VERSION", INT2NUM(LIBLINEAR_VERSION)); rb_define_module_function(mLiblinear, "train", numo_liblinear_train, 3); rb_define_module_function(mLiblinear, "cv", numo_liblinear_cross_validation, 4); rb_define_module_function(mLiblinear, "predict", numo_liblinear_predict, 3); rb_define_module_function(mLiblinear, "decision_function", numo_liblinear_decision_function, 3); rb_define_module_function(mLiblinear, "predict_proba", numo_liblinear_predict_proba, 3); rb_define_module_function(mLiblinear, "load_model", numo_liblinear_load_model, 1); rb_define_module_function(mLiblinear, "save_model", numo_liblinear_save_model, 3); rb_init_solver_type_module(); }