lib/xgboost/dmatrix.rb in xgb-0.8.0 vs lib/xgboost/dmatrix.rb in xgb-0.9.0
- old
+ new
@@ -1,81 +1,74 @@
module XGBoost
class DMatrix
# Class-level declarations. The old version exposed data, feature_names and
# feature_types through attr_reader; the new version mixes in Utils at the top
# of the class body and generates a reader only for the handle.
- attr_reader :data, :feature_names, :feature_types
+ include Utils
+ attr_reader :handle
+
# Builds a DMatrix from +data+ and optionally attaches +label+ and +weight+.
#
# data    - dispatched on matrix?/daru?/numo?/rover? (presumably Matrix, Daru,
#           Numo and Rover objects - confirm against those predicates); any
#           other value is treated as a collection of equally sized rows.
#           In the new version an ::FFI::AutoPointer is also accepted and is
#           stored as the handle without any further setup.
# label   - forwarded to label= when truthy.
# weight  - forwarded to weight= when truthy.
# missing - replaces nil entries of the flattened data and is passed on to
#           XGDMatrixCreateFromMat (default Float::NAN).
#
# Raises Error for a Daru column whose db_type is neither "INTEGER" nor
# "DOUBLE", and ArgumentError (IndexError in the old version) when the rows of
# the fallback branch differ in size.
def initialize(data, label: nil, weight: nil, missing: Float::NAN)
- @data = data
# New: a ready-made handle short-circuits construction; label and weight are
# not applied on this path because of the early return.
+ if data.is_a?(::FFI::AutoPointer)
+ @handle = data
+ return
+ end
- @handle = ::FFI::MemoryPointer.new(:pointer)
-
# Old: the whole conversion was guarded by `if data`, so nil produced an empty
# shell object. New: the guard is gone and the branches run unconditionally.
- if data
- if matrix?(data)
- nrow = data.row_count
- ncol = data.column_count
- flat_data = data.to_a.flatten
- elsif daru?(data)
- nrow, ncol = data.shape
- flat_data = data.map_rows(&:to_a).flatten
- @feature_names = data.each_vector.map(&:name)
- @feature_types =
- data.each_vector.map(&:db_type).map do |v|
- case v
- when "INTEGER"
- "int"
- when "DOUBLE"
- "float"
- else
- raise Error, "Unknown feature type: #{v}"
- end
+ if matrix?(data)
+ nrow = data.row_count
+ ncol = data.column_count
+ flat_data = data.to_a.flatten
+ elsif daru?(data)
+ nrow, ncol = data.shape
+ flat_data = data.map_rows(&:to_a).flatten
# New: names and types are collected in locals and applied through the setters
# at the end, instead of being stored in instance variables.
+ feature_names = data.each_vector.map(&:name)
+ feature_types =
+ data.each_vector.map(&:db_type).map do |v|
+ case v
+ when "INTEGER"
+ "int"
+ when "DOUBLE"
+ "float"
+ else
+ raise Error, "Unknown feature type: #{v}"
+ end
- elsif numo?(data)
- nrow, ncol = data.shape
- elsif rover?(data)
- nrow, ncol = data.shape
- @feature_names = data.keys
- data = data.to_numo
- else
- nrow = data.count
- ncol = data.first.count
- if !data.all? { |r| r.size == ncol }
- # TODO raise ArgumentError in 0.8.0
- raise IndexError, "Rows have different sizes"
end
- flat_data = data.flatten
# The numo? and rover? branches leave flat_data unset; the copy step below
# writes raw bytes instead when numo?(data) holds (data is reassigned to
# data.to_numo in the rover? branch - presumably that satisfies numo?).
+ elsif numo?(data)
+ nrow, ncol = data.shape
+ elsif rover?(data)
+ nrow, ncol = data.shape
+ feature_names = data.keys
+ data = data.to_numo
+ else
+ nrow = data.count
+ ncol = data.first.count
+ if !data.all? { |r| r.size == ncol }
+ raise ArgumentError, "Rows have different sizes"
end
+ flat_data = data.flatten
+ end
- c_data = ::FFI::MemoryPointer.new(:float, nrow * ncol)
- if numo?(data)
- c_data.write_bytes(data.cast_to(Numo::SFloat).to_string)
- else
- handle_missing(flat_data, missing)
- c_data.write_array_of_float(flat_data)
- end
- check_result FFI.XGDMatrixCreateFromMat(c_data, nrow, ncol, missing, @handle)
# Copy the values into a float buffer of nrow * ncol elements.
+ c_data = ::FFI::MemoryPointer.new(:float, nrow * ncol)
+ if numo?(data)
+ c_data.write_bytes(data.cast_to(Numo::SFloat).to_string)
+ else
+ handle_missing(flat_data, missing)
+ c_data.write_array_of_float(flat_data)
+ end
# Old: a finalizer registered on @handle called XGDMatrixFree. New: the pointer
# returned through +out+ is wrapped in an AutoPointer constructed with
# XGDMatrixFree, replacing the hand-written finalizer.
- ObjectSpace.define_finalizer(@handle, self.class.finalize(handle_pointer.to_i))
+ out = ::FFI::MemoryPointer.new(:pointer)
+ check_call FFI.XGDMatrixCreateFromMat(c_data, nrow, ncol, missing, out)
+ @handle = ::FFI::AutoPointer.new(out.read_pointer, FFI.method(:XGDMatrixFree))
- @feature_names ||= ncol.times.map { |i| "f#{i}" }
- end
# Default feature names are "f0", "f1", ... up to ncol - 1.
+ self.feature_names = feature_names || ncol.times.map { |i| "f#{i}" }
+ self.feature_types = feature_types if feature_types
self.label = label if label
self.weight = weight if weight
end
# Old: self.finalize built the proc that called XGDMatrixFree on the pointer
# at +addr+; it is removed in the new version.
# New, in the same position: save_binary passes the handle, +fname+ and the
# +silent+ flag (as 1 or 0) to XGDMatrixSaveBinary.
- def self.finalize(addr)
- # must use proc instead of stabby lambda
- proc { FFI.XGDMatrixFree(::FFI::Pointer.new(:pointer, addr)) }
+ def save_binary(fname, silent: true)
+ check_call FFI.XGDMatrixSaveBinary(handle, fname, silent ? 1 : 0)
end
# Removed from this position; label and weight readers with the same bodies
# are added back after group= in the new version.
- def label
- float_info("label")
- end
-
- def weight
- float_info("weight")
- end
-
# Stores +label+ under the "label" float field by delegating to
# set_float_info. Unchanged between the two versions.
def label=(label)
set_float_info("label", label)
end
def weight=(weight)
@@ -83,62 +76,183 @@
end
# Sets the "group" field: +group+ is copied into a native buffer and handed
# to XGDMatrixSetUIntInfo together with its size.
# NOTE(review): the buffer is allocated and written as :int while the C call
# is the UInt variant - presumably the same width; confirm.
def group=(group)
c_data = ::FFI::MemoryPointer.new(:int, group.size)
c_data.write_array_of_int(group)
# Only the way the handle is obtained and the result checker's name change.
- check_result FFI.XGDMatrixSetUIntInfo(handle_pointer, "group", c_data, group.size)
+ check_call FFI.XGDMatrixSetUIntInfo(handle, "group", c_data, group.size)
end
# Returns the "label" float field via float_info (added at this position in
# the new version).
+ def label
+ float_info("label")
+ end
+
# Returns the "weight" float field via float_info (added at this position in
# the new version).
+ def weight
+ float_info("weight")
+ end
+
# Returns the value XGDMatrixNumRow writes into a uint64 out-parameter.
def num_row
out = ::FFI::MemoryPointer.new(:uint64)
# New: the handle is passed directly and the result is read with the
# pointer's own read_uint64 instead of the read_uint64(out) helper.
- check_result FFI.XGDMatrixNumRow(handle_pointer, out)
- read_uint64(out)
+ check_call FFI.XGDMatrixNumRow(handle, out)
+ out.read_uint64
end
# Returns the value XGDMatrixNumCol writes into a uint64 out-parameter.
def num_col
out = ::FFI::MemoryPointer.new(:uint64)
# New: the handle is passed directly and the result is read with the
# pointer's own read_uint64 instead of the read_uint64(out) helper.
- check_result FFI.XGDMatrixNumCol(handle_pointer, out)
- read_uint64(out)
+ check_call FFI.XGDMatrixNumCol(handle, out)
+ out.read_uint64
end
# New in this version: returns the value XGDMatrixNumNonMissing writes into a
# uint64 out-parameter.
+ def num_nonmissing
+ out = ::FFI::MemoryPointer.new(:uint64)
+ check_call FFI.XGDMatrixNumNonMissing(handle, out)
+ out.read_uint64
+ end
+
# New in this version: returns :row when XGDMatrixDataSplitMode reports 0 and
# :col for any other value.
+ def data_split_mode
+ out = ::FFI::MemoryPointer.new(:uint64)
+ check_call FFI.XGDMatrixDataSplitMode(handle, out)
+ out.read_uint64 == 0 ? :row : :col
+ end
+
# Returns a new DMatrix produced by XGDMatrixSliceDMatrix for the indices in
# +rindex+ (presumably row indices - confirm against the C API).
def slice(rindex)
# Old: an empty DMatrix was created first and the C call filled its handle.
- res = DMatrix.new(nil)
idxset = ::FFI::MemoryPointer.new(:int, rindex.count)
idxset.write_array_of_int(rindex)
- check_result FFI.XGDMatrixSliceDMatrix(handle_pointer, idxset, rindex.size, res.handle)
- res
# New: the result pointer arrives through an out-parameter, is wrapped in an
# AutoPointer constructed with XGDMatrixFree, and is passed to DMatrix.new.
+ out = ::FFI::MemoryPointer.new(:pointer)
+ check_call FFI.XGDMatrixSliceDMatrix(handle, idxset, rindex.size, out)
+
# From this line on the local +handle+ shadows the handle reader; the call
# above still used the reader.
+ handle = ::FFI::AutoPointer.new(out.read_pointer, FFI.method(:XGDMatrixFree))
+ DMatrix.new(handle)
end
# Old save_binary is removed from this position; the new version defines it
# right after initialize.
# New: feature_names queries the "feature_name" string info through
# XGDMatrixGetStrFeatureInfo, converts it with from_cstr_to_rbstr and returns
# the result, or nil when that result is empty.
- def save_binary(fname, silent: true)
- check_result FFI.XGDMatrixSaveBinary(handle_pointer, fname, silent ? 1 : 0)
+ def feature_names
+ length = ::FFI::MemoryPointer.new(:uint64)
+ sarr = ::FFI::MemoryPointer.new(:pointer)
+ check_call(
+ FFI.XGDMatrixGetStrFeatureInfo(
+ handle,
+ "feature_name",
+ length,
+ sarr
+ )
+ )
+ feature_names = from_cstr_to_rbstr(sarr, length)
+ feature_names.empty? ? nil : feature_names
end
# Old hand-written handle reader is removed; the new version declares
# attr_reader :handle at the top of the class instead.
# New: feature_names= stores the feature names through
# XGDMatrixSetStrFeatureInfo.
#
# feature_names - nil clears the field (nil pointer, length 0); otherwise an
#                 Array checked by validate_feature_info against num_col.
#
# Raises ArgumentError when names are duplicated, when any entry is not a
# String, or when a name contains "[", "]" or "<". validate_feature_info may
# raise as well.
- def handle
- @handle
+ def feature_names=(feature_names)
+ if feature_names.nil?
+ check_call(
+ FFI.XGDMatrixSetStrFeatureInfo(
+ handle, "feature_name", nil, 0
+ )
+ )
+ return
+ end
+
+ # validate feature name
+ feature_names =
+ validate_feature_info(
+ feature_names,
+ num_col,
+ data_split_mode == :col,
+ "feature names"
+ )
+ if feature_names.length != feature_names.uniq.length
+ raise ArgumentError, "feature_names must be unique"
+ end
+
+ # prohibit the use symbols that may affect parsing. e.g. []<
+ if !feature_names.all? { |f| f.is_a?(String) && !["[", "]", "<"].any? { |x| f.include?(x) } }
+ raise ArgumentError, "feature_names must be string, and may not contain [, ] or <"
+ end
+
# Each name becomes a C string pointer; the array of those pointers is what
# the C call receives, along with the number of names.
+ c_feature_names = array_of_pointers(feature_names.map { |f| string_pointer(f) })
+ check_call(
+ FFI.XGDMatrixSetStrFeatureInfo(
+ handle,
+ "feature_name",
+ c_feature_names,
+ feature_names.length
+ )
+ )
end
# Old handle_pointer (which dereferenced @handle) is removed; new call sites
# pass handle directly.
# New: feature_types queries the "feature_type" string info through
# XGDMatrixGetStrFeatureInfo, converts it with from_cstr_to_rbstr and returns
# the result, or nil when that result is empty.
- def handle_pointer
- @handle.read_pointer
+ def feature_types
+ length = ::FFI::MemoryPointer.new(:uint64)
+ sarr = ::FFI::MemoryPointer.new(:pointer)
+ check_call(
+ FFI.XGDMatrixGetStrFeatureInfo(
+ handle,
+ "feature_type",
+ length,
+ sarr
+ )
+ )
+ res = from_cstr_to_rbstr(sarr, length)
+ res.empty? ? nil : res
end
# New in this version: stores the feature types through
# XGDMatrixSetStrFeatureInfo.
#
# feature_types - nil clears the field (nil pointer, length 0); otherwise an
#                 Array checked by validate_feature_info against num_col.
#
# Unlike feature_names=, no uniqueness or character checks are applied here.
+ def feature_types=(feature_types)
+ if feature_types.nil?
+ check_call(
+ FFI.XGDMatrixSetStrFeatureInfo(
+ handle, "feature_type", nil, 0
+ )
+ )
+ return
+ end
+
+ feature_types =
+ validate_feature_info(
+ feature_types,
+ num_col,
+ data_split_mode == :col,
+ "feature types"
+ )
+
# Each type string becomes a C string pointer; the array of those pointers
# is what the C call receives, along with the number of entries.
+ c_feature_types = array_of_pointers(feature_types.map { |f| string_pointer(f) })
+ check_call(
+ FFI.XGDMatrixSetStrFeatureInfo(
+ handle,
+ "feature_type",
+ c_feature_types,
+ feature_types.length
+ )
+ )
+ end
+
private
# Writes +data+ into the float field named +field+.
# Non-Array input is converted with to_a, copied into a native float buffer,
# and passed to XGDMatrixSetFloatInfo together with its size.
def set_float_info(field, data)
data = data.to_a unless data.is_a?(Array)
c_data = ::FFI::MemoryPointer.new(:float, data.size)
c_data.write_array_of_float(data)
# Only the way the handle is obtained and the result checker's name change.
- check_result FFI.XGDMatrixSetFloatInfo(handle_pointer, field.to_s, c_data, data.size)
+ check_call FFI.XGDMatrixSetFloatInfo(handle, field.to_s, c_data, data.size)
end
# Reads the float field named +field+ and returns num_row floats from the
# pointer XGDMatrixGetFloatInfo stores in out_dptr.
# NOTE(review): the length written to out_len is never read; num_row is used
# as the element count instead. Confirm this is safe when the field has not
# been set or does not hold one value per row.
def float_info(field)
# The local num_row starts out nil, so this always calls the num_row method.
num_row ||= num_row()
# New: the length out-parameter is allocated as :uint64 rather than :int.
- out_len = ::FFI::MemoryPointer.new(:int)
+ out_len = ::FFI::MemoryPointer.new(:uint64)
out_dptr = ::FFI::MemoryPointer.new(:float, num_row)
- check_result FFI.XGDMatrixGetFloatInfo(handle_pointer, field, out_len, out_dptr)
+ check_call FFI.XGDMatrixGetFloatInfo(handle, field, out_len, out_dptr)
out_dptr.read_pointer.read_array_of_float(num_row)
end
# New in this version: shared validation for feature names and feature types.
#
# feature_info    - the value to check; must be an Array.
# n_features      - expected length; 0 disables the length check.
# is_column_split - when truthy, the length check is skipped.
# name            - label used in the error messages.
#
# Returns +feature_info+ unchanged.
# Raises TypeError when +feature_info+ is not an Array, and ArgumentError
# when its length differs from +n_features+ (subject to the exemptions above).
+ def validate_feature_info(feature_info, n_features, is_column_split, name)
+ if !feature_info.is_a?(Array)
+ raise TypeError, "Expecting an array of strings for #{name}, got: #{feature_info.class.name}"
+ end
+ if feature_info.length != n_features && n_features != 0 && !is_column_split
+ msg = (
+ "#{name} must have the same length as the number of data columns, " +
+ "expected #{n_features}, got #{feature_info.length}"
+ )
+ raise ArgumentError, msg
+ end
+ feature_info
+ end
+
# Tells whether +data+ is a Matrix.
# Returns nil when the Matrix constant is not defined at all; otherwise
# returns the result of data.is_a?(Matrix).
def matrix?(data)
  return unless defined?(Matrix)

  data.is_a?(Matrix)
end
def daru?(data)
@@ -154,9 +268,7 @@
end
# Replaces every nil entry of +data+ with +missing+, in place.
# Returns the same (mutated) collection that was passed in.
def handle_missing(data, missing)
  data.map! do |value|
    if value.nil?
      missing
    else
      value
    end
  end
end
-
# Removed from the end of the class; the new version includes Utils at the
# top of the class body instead.
- include Utils
end
end