lib/rumale/preprocessing/one_hot_encoder.rb in rumale-0.13.1 vs lib/rumale/preprocessing/one_hot_encoder.rb in rumale-0.13.2
- old
+ new
@@ -25,31 +25,37 @@
# Return the number of possible values (maximum value + 1) for each feature.
# @return [Numo::Int32] (shape: [n_features])
attr_reader :n_values
+ # Return the indices for feature values that actually occur in the training set.
+ # @return [Numo::Int32]
+ attr_reader :active_features
+
# Return the indices to feature ranges.
# @return [Numo::Int32] (shape: [n_features + 1])
attr_reader :feature_indices
# Create a new encoder for encoding categorical integer features to one-hot-vectors
def initialize
@params = {}
@n_values = nil
+ @active_features = nil
@feature_indices = nil
end
# Fit one-hot-encoder to samples.
#
# @overload fit(x) -> OneHotEncoder
- #
- # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder.
+ # @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to fit one-hot-encoder.
# @return [OneHotEncoder]
def fit(x, _y = nil)
check_params_type(Numo::Int32, x: x)
+ raise ArgumentError, 'Expected the input samples only consists of non-negative integer values.' if x.lt(0).any?
@n_values = x.max(0) + 1
@feature_indices = Numo::Int32.hstack([[0], @n_values]).cumsum
+ @active_features = encode(x, @feature_indices).sum(0).ne(0).where
self
end
# Fit one-hot-encoder to samples, then encode samples into one-hot-vectors
#
@@ -57,42 +63,53 @@
#
# @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
# @return [Numo::DFloat] The one-hot-vectors.
def fit_transform(x, _y = nil)
check_params_type(Numo::Int32, x: x)
+ raise ArgumentError, 'Expected the input samples only consists of non-negative integer values.' if x.lt(0).any?
fit(x).transform(x)
end
# Encode samples into one-hot-vectors.
#
# @param x [Numo::Int32] (shape: [n_samples, n_features]) The samples to encode into one-hot-vectors.
# @return [Numo::DFloat] The one-hot-vectors.
def transform(x)
check_params_type(Numo::Int32, x: x)
- n_samples, n_features = x.shape
- n_features = 1 if n_features.nil?
- column_indices = (x + @feature_indices[0...-1]).flatten.to_a
- row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a
- codes = Numo::DFloat.zeros(n_samples, @feature_indices[-1])
- row_indices.zip(column_indices).each { |r, c| codes[r, c] = 1.0 }
- codes
+ raise ArgumentError, 'Expected the input samples only consists of non-negative integer values.' if x.lt(0).any?
+ codes = encode(x, @feature_indices)
+ codes[true, @active_features].dup
end
# Dump marshal data.
# @return [Hash] The marshal data about OneHotEncoder.
def marshal_dump
{ params: @params,
n_values: @n_values,
+ active_features: @active_features,
feature_indices: @feature_indices }
end
# Load marshal data.
# @return [nil]
def marshal_load(obj)
@params = obj[:params]
@n_values = obj[:n_values]
+ @active_features = obj[:active_features]
@feature_indices = obj[:feature_indices]
nil
+ end
+
+ private
+
+ def encode(x, indices)
+ n_samples, n_features = x.shape
+ n_features = 1 if n_features.nil?
+ col_indices = (x + indices[0...-1]).flatten.to_a
+ row_indices = Numo::Int32.new(n_samples).seq.repeat(n_features).to_a
+ codes = Numo::DFloat.zeros(n_samples, indices[-1])
+ row_indices.zip(col_indices).each { |r, c| codes[r, c] = 1.0 }
+ codes
end
end
end
end