lib/dnn/core/optimizers.rb in ruby-dnn-0.14.3 vs lib/dnn/core/optimizers.rb in ruby-dnn-0.15.0

- old
+ new

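Reading guide: two changes run through every hunk below. First, each optimizer constructor now takes the learning rate as a keyword argument (lr:) instead of a positional one. Second, the per-parameter state hashes (@v, @g, @m, @s, @h) are keyed by the param object itself rather than by param.name. A minimal sketch of the constructor change (the call is illustrative; assuming the gem is loaded with require "dnn"):

    require "dnn"

    # ruby-dnn 0.14.3: learning rate was positional.
    # opt = DNN::Optimizers::SGD.new(0.01, momentum: 0.9)

    # ruby-dnn 0.15.0: learning rate is a keyword argument.
    opt = DNN::Optimizers::SGD.new(lr: 0.01, momentum: 0.9)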
@@ -1,11 +1,10 @@
 module DNN
   module Optimizers

     # Super class of all optimizer classes.
     class Optimizer
-      attr_reader :status
       attr_accessor :clip_norm

       def self.from_hash(hash)
         return nil unless hash
         optimizer_class = DNN.const_get(hash[:class])
@@ -13,43 +12,27 @@
         raise DNN_Error, "#{optimizer.class} is not an instance of #{self} class." unless optimizer.is_a?(self)
         optimizer.load_hash(hash)
         optimizer
       end

-      def self.load(dumped)
-        opt = from_hash(dumped[:hash])
-        return opt unless dumped[:status]
-        dumped[:status].each do |key, state|
-          state = state.clone
-          opt.status[key] = state
-          opt.instance_variable_set("@#{key}", state)
-        end
-        opt
-      end
-
       # @param [Float | NilClass] clip_norm Gradient clip norm.
       def initialize(clip_norm: nil)
         @clip_norm = clip_norm
       end

       # Update layers has params.
       def update(layers)
-        target_params = layers.select { |layer| layer.is_a?(Layers::HasParamLayer) && layer.trainable }
+        target_params = layers.select { |layer| layer.is_a?(Layers::TrainableLayer) && layer.trainable }
                               .map { |layer| layer.get_params.values }.flatten.compact
                               .select(&:grad)
         clip_grads(target_params) if @clip_norm
         update_params(target_params)
         target_params.each do |param|
           param.grad = Xumo::SFloat[0]
         end
       end

-      def dump(require_status = true)
-        status = require_status ? @status : nil
-        { hash: to_hash, status: status }
-      end
-
       def to_hash(merge_hash = nil)
         hash = { class: self.class.name, clip_norm: @clip_norm }
         hash.merge!(merge_hash) if merge_hash
         hash
       end
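The hunks above remove the optimizer-state serialization path entirely: attr_reader :status, Optimizer.load, and Optimizer#dump are gone in 0.15.0, and the layer check moves from Layers::HasParamLayer to Layers::TrainableLayer. What remains is the to_hash / from_hash round-trip, which carries hyperparameters but not accumulated per-parameter state. A hedged sketch of that round-trip (method names from the diff; the exact hash contents are inferred from to_hash above):

    require "dnn"

    opt = DNN::Optimizers::SGD.new(lr: 0.01, momentum: 0.9)
    hash = opt.to_hash
    # => { class: "DNN::Optimizers::SGD", clip_norm: nil, lr: 0.01, momentum: 0.9 }

    restored = DNN::Optimizers::Optimizer.from_hash(hash)
    restored.lr       # => 0.01
    restored.momentum # => 0.9
    # The momentum history (@v) is NOT restored; it starts empty.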
@@ -78,82 +61,80 @@
       attr_accessor :lr
       attr_accessor :momentum

       # @param [Float] lr Learning rate.
       # @param [Float] momentum Momentum coefficient.
-      def initialize(lr = 0.01, momentum: 0, clip_norm: nil)
+      def initialize(lr: 0.01, momentum: 0, clip_norm: nil)
         super(clip_norm: clip_norm)
         @lr = lr
         @momentum = momentum
         @v = {}
-        @status = { v: @v }
       end

       def to_hash
         super(lr: @lr, momentum: @momentum)
       end

       private def update_params(params)
         params.each do |param|
           amount = param.grad * @lr
           if @momentum > 0
-            @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-            amount += @momentum * @v[param.name]
-            @v[param.name] = amount
+            @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+            amount += @momentum * @v[param]
+            @v[param] = amount
           end
           param.data -= amount
         end
       end

       def load_hash(hash)
-        initialize(hash[:lr], momentum: hash[:momentum], clip_norm: hash[:clip_norm])
+        initialize(lr: hash[:lr], momentum: hash[:momentum], clip_norm: hash[:clip_norm])
       end
     end

     class Nesterov < SGD
-      def initialize(lr = 0.01, momentum: 0.9, clip_norm: nil)
-        super(lr, momentum: momentum, clip_norm: clip_norm)
+      def initialize(lr: 0.01, momentum: 0.9, clip_norm: nil)
+        super(lr: lr, momentum: momentum, clip_norm: clip_norm)
       end

       private def update_params(params)
         params.each do |param|
-          @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
           amount = param.grad * @lr
-          @v[param.name] = @v[param.name] * @momentum - amount
-          param.data = (param.data + @momentum**2 * @v[param.name]) - (1 + @momentum) * amount
+          @v[param] = @v[param] * @momentum - amount
+          param.data = (param.data + @momentum**2 * @v[param]) - (1 + @momentum) * amount
         end
       end
     end

     class AdaGrad < Optimizer
       attr_accessor :lr
       attr_accessor :eps

       # @param [Float] lr Learning rate.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(lr = 0.01, eps: 1e-7, clip_norm: nil)
+      def initialize(lr: 0.01, eps: 1e-7, clip_norm: nil)
         super(clip_norm: clip_norm)
         @lr = lr
         @eps = eps
         @g = {}
-        @status = { g: @g }
       end

       private def update_params(params)
         params.each do |param|
-          @g[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @g[param.name] += param.grad**2
-          param.data -= (@lr / Xumo::NMath.sqrt(@g[param.name] + @eps)) * param.grad
+          @g[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @g[param] += param.grad**2
+          param.data -= (@lr / Xumo::NMath.sqrt(@g[param] + @eps)) * param.grad
         end
       end

       def to_hash
         super(lr: @lr, eps: @eps)
       end

       def load_hash(hash)
-        initialize(hash[:lr], eps: hash[:eps], clip_norm: hash[:clip_norm])
+        initialize(lr: hash[:lr], eps: hash[:eps], clip_norm: hash[:clip_norm])
       end
     end

     class RMSProp < Optimizer
       attr_accessor :lr
@@ -161,33 +142,32 @@
       attr_accessor :eps

       # @param [Float] lr Learning rate.
       # @param [Float] alpha Moving average index of past slopes.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(lr = 0.001, alpha: 0.9, eps: 1e-7, clip_norm: nil)
+      def initialize(lr: 0.001, alpha: 0.9, eps: 1e-7, clip_norm: nil)
         super(clip_norm: clip_norm)
         @lr = lr
         @alpha = alpha
         @eps = eps
         @g = {}
-        @status = { g: @g }
       end

       def to_hash
         super(lr: @lr, alpha: @alpha, eps: @eps)
       end

       private def update_params(params)
         params.each do |param|
-          @g[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @g[param.name] = @alpha * @g[param.name] + (1 - @alpha) * param.grad**2
-          param.data -= (@lr / Xumo::NMath.sqrt(@g[param.name] + @eps)) * param.grad
+          @g[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @g[param] = @alpha * @g[param] + (1 - @alpha) * param.grad**2
+          param.data -= (@lr / Xumo::NMath.sqrt(@g[param] + @eps)) * param.grad
         end
       end

       def load_hash(hash)
-        initialize(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
+        initialize(lr: hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
       end
     end

     class AdaDelta < Optimizer
       attr_accessor :rho
@@ -199,24 +179,23 @@
         super(clip_norm: clip_norm)
         @rho = rho
         @eps = eps
         @h = {}
         @s = {}
-        @status = { h: @h, s: @s }
       end

       def to_hash
         super(rho: @rho, eps: @eps)
       end

       private def update_params(params)
         params.each do |param|
-          @h[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @s[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @h[param.name] = @rho * @h[param.name] + (1 - @rho) * param.grad**2
-          v = (Xumo::NMath.sqrt(@s[param.name] + @eps) / Xumo::NMath.sqrt(@h[param.name] + @eps)) * param.grad
-          @s[param.name] = @rho * @s[param.name] + (1 - @rho) * v**2
+          @h[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @h[param] = @rho * @h[param] + (1 - @rho) * param.grad**2
+          v = (Xumo::NMath.sqrt(@s[param] + @eps) / Xumo::NMath.sqrt(@h[param] + @eps)) * param.grad
+          @s[param] = @rho * @s[param] + (1 - @rho) * v**2
           param.data -= v
         end
       end

       def load_hash(hash)
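From here on, every state hash switches its key from param.name to the param object itself. Keying by name shares one optimizer state slot whenever two parameters report the same name (for example the :weight of two different layers); keying by object gives each parameter its own slot. A standalone Ruby illustration of the difference (Param here is a stand-in Struct, not the gem's class):

    Param = Struct.new(:name, :data)

    w1 = Param.new(:weight, [1.0])
    w2 = Param.new(:weight, [2.0])

    # 0.14.3-style keying: same name, same slot -- w2 clobbers w1's state.
    by_name = { w1.name => :v1 }
    by_name[w2.name] = :v2
    by_name.size # => 1

    # 0.15.0-style keying: each param object gets its own slot.
    by_param = { w1 => :v1 }
    by_param[w2] = :v2
    by_param.size # => 2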
@@ -230,36 +209,35 @@
       attr_accessor :eps

       # @param [Float] lr Learning rate.
       # @param [Float] alpha Moving average index of past slopes.
       # @param [Float] eps Value to avoid division by zero.
-      def initialize(lr = 0.0001, alpha: 0.95, eps: 0.0001, clip_norm: nil)
+      def initialize(lr: 0.0001, alpha: 0.95, eps: 0.0001, clip_norm: nil)
         super(clip_norm: clip_norm)
         @lr = lr
         @alpha = alpha
         @eps = eps
         @m = {}
         @v = {}
-        @status = { m: @m, v: @v }
       end

       def to_hash
         super(lr: @lr, alpha: @alpha, eps: @eps)
       end

       private def update_params(params)
         params.each do |param|
-          @m[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @m[param.name] = @alpha * @m[param.name] + (1 - @alpha) * param.grad
-          @v[param.name] = @alpha * @v[param.name] + (1 - @alpha) * param.grad**2
-          param.data -= (@lr / Xumo::NMath.sqrt(@v[param.name] - @m[param.name]**2 + @eps)) * param.grad
+          @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @m[param] = @alpha * @m[param] + (1 - @alpha) * param.grad
+          @v[param] = @alpha * @v[param] + (1 - @alpha) * param.grad**2
+          param.data -= (@lr / Xumo::NMath.sqrt(@v[param] - @m[param]**2 + @eps)) * param.grad
         end
       end

       def load_hash(hash)
-        initialize(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
+        initialize(lr: hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
       end
     end

     class Adam < Optimizer
       attr_accessor :alpha
@@ -282,11 +260,10 @@
         @amsgrad = amsgrad
         @t = 0
         @m = {}
         @v = {}
         @s = amsgrad ? {} : nil
-        @status = { t: @t, m: @m, v: @v, s: @s }
       end

       def to_hash
         {
           class: self.class.name, alpha: @alpha, beta1: @beta1, beta2: @beta2,
@@ -296,20 +273,20 @@
       private def update_params(params)
         @t += 1
         lr = @alpha * Math.sqrt(1 - @beta2**@t) / (1 - @beta1**@t)
         params.each do |param|
-          @m[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @m[param.name] += (1 - @beta1) * (param.grad - @m[param.name])
-          @v[param.name] += (1 - @beta2) * (param.grad**2 - @v[param.name])
+          @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @m[param] += (1 - @beta1) * (param.grad - @m[param])
+          @v[param] += (1 - @beta2) * (param.grad**2 - @v[param])
           if @amsgrad
-            @s[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-            @s[param.name] = Xumo::SFloat.maximum(@s[param.name], @v[param.name])
-            param.data -= lr * @m[param.name] / Xumo::NMath.sqrt(@s[param.name] + @eps)
+            @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+            @s[param] = Xumo::SFloat.maximum(@s[param], @v[param])
+            param.data -= lr * @m[param] / Xumo::NMath.sqrt(@s[param] + @eps)
           else
-            param.data -= lr * @m[param.name] / Xumo::NMath.sqrt(@v[param.name] + @eps)
+            param.data -= lr * @m[param] / Xumo::NMath.sqrt(@v[param] + @eps)
           end
         end
       end

       def load_hash(hash)
@@ -342,19 +319,19 @@
         lr = @alpha * Math.sqrt(1 - @beta2**@t) / (1 - @beta1**@t)
         final_lr = @final_lr * lr / @alpha
         lower_bound = final_lr * (1 - 1 / (@gamma * @t + 1))
         upper_bound = final_lr * (1 + 1 / (@gamma * @t))
         params.each do |param|
-          @m[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-          @m[param.name] += (1 - @beta1) * (param.grad - @m[param.name])
-          @v[param.name] += (1 - @beta2) * (param.grad**2 - @v[param.name])
+          @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @m[param] += (1 - @beta1) * (param.grad - @m[param])
+          @v[param] += (1 - @beta2) * (param.grad**2 - @v[param])
           if @amsgrad
-            @s[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-            @s[param.name] = Xumo::SFloat.maximum(@s[param.name], @v[param.name])
-            param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@s[param.name]) + @eps), lower_bound, upper_bound) * @m[param.name]
+            @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+            @s[param] = Xumo::SFloat.maximum(@s[param], @v[param])
+            param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@s[param]) + @eps), lower_bound, upper_bound) * @m[param]
          else
-            param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@v[param.name]) + @eps), lower_bound, upper_bound) * @m[param.name]
+            param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@v[param]) + @eps), lower_bound, upper_bound) * @m[param]
           end
         end
       end

       private def clip_lr(lr, lower_bound, upper_bound)
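Unchanged by this diff but central to the last two hunks: Adam and AdaBound recompute a bias-corrected step size on every call to update_params, and AdaBound then clamps it between bounds that tighten toward final_lr. The correction factor from the diff, evaluated standalone with illustrative values (the actual defaults are not shown in these hunks):

    # lr_t = alpha * sqrt(1 - beta2**t) / (1 - beta1**t)
    alpha, beta1, beta2 = 0.001, 0.9, 0.999
    lr_at = ->(t) { alpha * Math.sqrt(1 - beta2**t) / (1 - beta1**t) }

    lr_at.call(1)      # => ~0.000316 (strong correction on the first step)
    lr_at.call(10_000) # => ~0.001    (approaches alpha as t grows)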