lib/rumale/preprocessing/one_hot_encoder.rb in rumale-0.13.1 vs lib/rumale/preprocessing/one_hot_encoder.rb in rumale-0.13.2

- old
+ new

@@ -25,31 +25,37 @@ # Return the maximum values for each feature. # @return [Numo::Int32] (shape: [n_features]) attr_reader :n_values + # Return the indices for feature values that actually occur in the training set. + # @return [Numo::Int32] + attr_reader :active_features + # Return the indices to feature ranges. # @return [Numo::Int32] (shape: [n_features + 1]) attr_reader :feature_indices # Create a new encoder for encoding categorical integer features to one-hot-vectors def initialize @params = {} @n_values = nil + @active_features = nil @feature_indices = nil end # Fit one-hot-encoder to samples. # # @overload fit(x) -> OneHotEncoder - # - # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder. + # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder. # @return [OneHotEncoder] def fit(x, _y = nil) check_params_type(Numo::Int32, x: x) + raise ArgumentError, 'Expected the input samples only consists of non-negative integer values.' if x.lt(0).any? @n_values = x.max(0) + 1 @feature_indices = Numo::Int32.hstack([[0], @n_values]).cumsum + @active_features = encode(x, @feature_indices).sum(0).ne(0).where self end # Fit one-hot-encoder to samples, then encode samples into one-hot-vectors # @@ -57,42 +63,53 @@ # # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors. # @return [Numo::DFloat] The one-hot-vectors. def fit_transform(x, _y = nil) check_params_type(Numo::Int32, x: x) + raise ArgumentError, 'Expected the input samples only consists of non-negative integer values.' if x.lt(0).any? fit(x).transform(x) end # Encode samples into one-hot-vectors. # # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors. # @return [Numo::DFloat] The one-hot-vectors. def transform(x) check_params_type(Numo::Int32, x: x) - n_samples, n_features = x.shape - n_features = 1 if n_features.nil? 
- column_indices = (x + @feature_indices[0...-1]).flatten.to_a - row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a - codes = Numo::DFloat.zeros(n_samples, @feature_indices[-1]) - row_indices.zip(column_indices).each { |r, c| codes[r, c] = 1.0 } - codes + raise ArgumentError, 'Expected the input samples only consists of non-negative integer values.' if x.lt(0).any? + codes = encode(x, @feature_indices) + codes[true, @active_features].dup end # Dump marshal data. # @return [Hash] The marshal data about OneHotEncoder. def marshal_dump { params: @params, n_values: @n_values, + active_features: @active_features, feature_indices: @feature_indices } end # Load marshal data. # @return [nil] def marshal_load(obj) @params = obj[:params] @n_values = obj[:n_values] + @active_features = obj[:active_features] @feature_indices = obj[:feature_indices] nil + end + + private + + def encode(x, indices) + n_samples, n_features = x.shape + n_features = 1 if n_features.nil? + col_indices = (x + indices[0...-1]).flatten.to_a + row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a + codes = Numo::DFloat.zeros(n_samples, indices[-1]) + row_indices.zip(col_indices).each { |r, c| codes[r, c] = 1.0 } + codes end end end end