lib/dnn/core/optimizers.rb in ruby-dnn-0.10.1 vs lib/dnn/core/optimizers.rb in ruby-dnn-0.10.2

- old
+ new

@@ -1,283 +1,283 @@
(The 283 lines removed from 0.10.1 and the 283 lines added in 0.10.2 read identically; the 0.10.2 (+) side is shown once below.)
+module DNN
+  module Optimizers
+
+    # Super class of all optimizer classes.
+    class Optimizer
+      # @return [Float] Return the Learning rate.
+      attr_accessor :learning_rate
+
+      def initialize(learning_rate)
+        @learning_rate = learning_rate
+      end
+
+      # Update layers has param.
+      def update(layers)
+        target_params = layers.select { |layer| layer.is_a?(HasParamLayer) && layer.trainable }
+                              .map { |layer| layer.params.values }.flatten
+                              .select { |param| param.grad }
+        target_params.each do |param|
+          update_param(param)
+          param.grad = 0
+        end
+      end
+
+      def to_hash(merge_hash = nil)
+        hash = {class: self.class.name, learning_rate: @learning_rate}
+        hash.merge!(merge_hash) if merge_hash
+        hash
+      end
+
+      # Update param.
+      # Classes that inherit from this class must implement this method.
+      private def update_param(param)
+        raise NotImplementedError.new("Class '#{self.class.name}' has implement method 'update_param'")
+      end
+    end
+
+
+    class SGD < Optimizer
+      # @return [Float] Return the momentum coefficient.
+      attr_accessor :momentum
+
+      def self.from_hash(hash)
+        self.new(hash[:learning_rate], momentum: hash[:momentum])
+      end
+
+      # @param [Float] learning_rate Learning rate.
+      # @param [Float] momentum momentum coefficient.
+      def initialize(learning_rate = 0.01, momentum: 0)
+        super(learning_rate)
+        @momentum = momentum
+        @v = {}
+      end
+
+      def to_hash
+        super(momentum: @momentum)
+      end
+
+      private def update_param(param)
+        amount = param.grad * @learning_rate
+        if @momentum > 0
+          @v[param] ||= 0
+          amount += @momentum * @v[param]
+          @v[param] = amount
+        end
+        param.data -= amount
+      end
+    end
+
+
+    class Nesterov < Optimizer
+      attr_accessor :momentum
+
+      def self.from_hash(hash)
+        self.new(hash[:learning_rate], momentum: hash[:momentum])
+      end
+
+      # @param [Float] learning_rate Learning rate.
+      # @param [Float] momentum momentum coefficient.
+      def initialize(learning_rate = 0.01, momentum: 0.9)
+        super(learning_rate)
+        @momentum = momentum
+        @v = {}
+      end
+
+      def to_hash
+        super(momentum: @momentum)
+      end
+
+      private def update_param(param)
+        @v[param] ||= 0
+        amount = param.grad * @learning_rate
+        @v[param] = @v[param] * @momentum - amount
+        param.data = (param.data + @momentum**2 * @v[param]) - (1 + @momentum) * amount
+      end
+    end
+
+
+    class AdaGrad < Optimizer
+      # @return [Float] Return the eps value.
+      attr_accessor :eps
+
+      # @param [Float] learning_rate Learning rate.
+      # @param [Float] eps Value to avoid division by zero.
+      def initialize(learning_rate = 0.01, eps: 1e-7)
+        super(learning_rate)
+        @eps = eps
+        @g = {}
+      end
+
+      def self.from_hash(hash)
+        self.new(hash[:learning_rate], eps: hash[:eps])
+      end
+
+      private def update_param(param)
+        @g[param] ||= 0
+        @g[param] += param.grad**2
+        param.data -= (@learning_rate / NMath.sqrt(@g[param] + @eps)) * param.grad
+      end
+
+      def to_hash
+        super(eps: @eps)
+      end
+    end
+
+
+    class RMSProp < Optimizer
+      # @return [Float] Return the alpha value.
+      attr_accessor :alpha
+      # @return [Float] Return the eps value.
+      attr_accessor :eps
+
+      def self.from_hash(hash)
+        self.new(hash[:learning_rate], alpha: hash[:alpha], eps: hash[:eps])
+      end
+
+      # @param [Float] learning_rate Learning rate.
+      # @param [Float] alpha Moving average index of past slopes.
+      # @param [Float] eps Value to avoid division by zero.
+      def initialize(learning_rate = 0.001, alpha: 0.9, eps: 1e-7)
+        super(learning_rate)
+        @alpha = alpha
+        @eps = eps
+        @g = {}
+      end
+
+      def to_hash
+        super(alpha: @alpha, eps: @eps)
+      end
+
+      private def update_param(param)
+        @g[param] ||= 0
+        @g[param] = @alpha * @g[param] + (1 - @alpha) * param.grad**2
+        param.data -= (@learning_rate / NMath.sqrt(@g[param] + @eps)) * param.grad
+      end
+    end
+
+
+    class AdaDelta < Optimizer
+      # @return [Float] Return the rho value.
+      attr_accessor :rho
+      # @return [Float] Return the eps value.
+      attr_accessor :eps
+
+      def self.from_hash(hash)
+        self.new(rho: hash[:rho], eps: hash[:eps])
+      end
+
+      # @param [Float] rho Moving average index of past slopes.
+      # @param [Float] eps Value to avoid division by zero.
+      def initialize(rho: 0.95, eps: 1e-6)
+        super(nil)
+        @rho = rho
+        @eps = eps
+        @h = {}
+        @s = {}
+      end
+
+      def to_hash
+        super(rho: @rho, eps: @eps)
+      end
+
+      private def update_param(param)
+        @h[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+        @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+        @h[param] = @rho * @h[param] + (1 - @rho) * param.grad**2
+        v = (NMath.sqrt(@s[param] + @eps) / NMath.sqrt(@h[param] + @eps)) * param.grad
+        @s[param] = @rho * @s[param] + (1 - @rho) * v**2
+        param.data -= v
+      end
+    end
+
+
+    class Adam < Optimizer
+      # @return [Float] Return the alpha value.
+      attr_accessor :alpha
+      # @return [Float] Return the beta1 value.
+      attr_accessor :beta1
+      # @return [Float] Return the beta2 value.
+      attr_accessor :beta2
+      # @return [Float] Return the eps value.
+      attr_accessor :eps
+
+      def self.from_hash(hash)
+        self.new(alpha: hash[:alpha], beta1: hash[:beta1], beta2: hash[:beta2], eps: hash[:eps])
+      end
+
+      # @param [Float] alpha Value used to calculate learning rate.
+      # @param [Float] beta1 Moving average index of beta1.
+      # @param [Float] beta2 Moving average index of beta2.
+      # @param [Float] eps Value to avoid division by zero.
+      def initialize(alpha: 0.001, beta1: 0.9, beta2: 0.999, eps: 1e-7)
+        super(nil)
+        @alpha = alpha
+        @beta1 = beta1
+        @beta2 = beta2
+        @eps = eps
+        @iter = 0
+        @m = {}
+        @v = {}
+      end
+
+      def update(layers)
+        @iter += 1
+        learning_rate = @alpha * Math.sqrt(1 - @beta2**@iter) / (1 - @beta1**@iter)
+        target_params = layers.select { |layer| layer.is_a?(HasParamLayer) && layer.trainable }
+                              .map { |layer| layer.params.values }.flatten
+                              .select { |param| param.grad }
+        target_params.each do |param|
+          update_param(param, learning_rate)
+          param.grad = 0
+        end
+      end
+
+      def to_hash
+        super(alpha: @alpha, beta1: @beta1, beta2: @beta2, eps: @eps)
+      end
+
+      private def update_param(param, learning_rate)
+        @m[param] ||= 0
+        @v[param] ||= 0
+        @m[param] += (1 - @beta1) * (param.grad - @m[param])
+        @v[param] += (1 - @beta2) * (param.grad**2 - @v[param])
+        param.data -= learning_rate * @m[param] / NMath.sqrt(@v[param] + @eps)
+      end
+    end
+
+
+    class RMSPropGraves < Optimizer
+      # @return [Float] Return the alpha value.
+      attr_accessor :alpha
+      # @return [Float] Return the eps value.
+      attr_accessor :eps
+
+      def self.from_hash(hash)
+        self.new(hash[:learning_rate], alpha: hash[:alpha], eps: hash[:eps])
+      end
+
+      # @param [Float] learning_rate Learning rate.
+      # @param [Float] alpha Moving average index of past slopes.
+      # @param [Float] eps Value to avoid division by zero.
+      def initialize(learning_rate = 0.0001, alpha: 0.95, eps: 0.0001)
+        super(learning_rate)
+        @alpha = alpha
+        @eps = eps
+        @m = {}
+        @v = {}
+      end
+
+      def to_hash
+        super(alpha: @alpha, eps: @eps)
+      end
+
+      private def update_param(param)
+        @m[param] ||= 0
+        @v[param] ||= 0
+        @m[param] = @alpha * @m[param] + (1 - @alpha) * param.grad
+        @v[param] = @alpha * @v[param] + (1 - @alpha) * param.grad**2
+        param.data -= (@learning_rate / NMath.sqrt(@v[param] - @m[param]**2 + @eps)) * param.grad
+      end
+    end
+
+  end
+end
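
All of these optimizers share one contract: Optimizer#update collects the trainable params from HasParamLayer layers and hands each one to the subclass's update_param, which adjusts param.data in place from param.grad before the gradient is cleared. Below is a minimal, self-contained sketch of that update rule for SGD with momentum; the Param struct, the scalar weight, and the fixed gradient are hypothetical stand-ins for illustration, not ruby-dnn classes.

# A minimal sketch (hypothetical stand-ins, not ruby-dnn code): Param exposes the
# two fields every optimizer above relies on, data and grad.
Param = Struct.new(:data, :grad)

learning_rate = 0.01
momentum      = 0.9
velocity      = 0                 # plays the role of SGD's per-param @v entry

param = Param.new(1.0, 0.5)       # toy scalar weight with a fixed gradient

3.times do
  # Same arithmetic as SGD#update_param when momentum > 0.
  amount = param.grad * learning_rate
  amount += momentum * velocity
  velocity = amount
  param.data -= amount
end

p param.data
# step 1: amount = 0.005            -> data = 0.995
# step 2: amount = 0.005 + 0.0045   -> data = 0.9855
# step 3: amount = 0.005 + 0.00855  -> data ≈ 0.97195

Nesterov, AdaGrad, RMSProp, AdaDelta, and RMSPropGraves differ only in how the step is derived from their per-param caches (@v, @g, @h/@s, @m/@v); Adam additionally overrides update itself so that the bias-corrected rate @alpha * Math.sqrt(1 - @beta2**@iter) / (1 - @beta1**@iter) is passed to update_param on every iteration.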