lib/dnn/core/optimizers.rb in ruby-dnn-0.13.4 vs lib/dnn/core/optimizers.rb in ruby-dnn-0.14.0

- old
+ new

@@ -8,17 +8,18 @@
  def self.from_hash(hash)
    return nil unless hash
    optimizer_class = DNN.const_get(hash[:class])
    optimizer = optimizer_class.allocate
-   raise DNN_Error.new("#{optimizer.class} is not an instance of #{self} class.") unless optimizer.is_a?(self)
+   raise DNN_Error, "#{optimizer.class} is not an instance of #{self} class." unless optimizer.is_a?(self)
    optimizer.load_hash(hash)
    optimizer
  end

  def self.load(dumped)
    opt = from_hash(dumped[:hash])
+   return opt unless dumped[:status]
    dumped[:status].each do |key, state|
      state = state.clone
      opt.status[key] = state
      opt.instance_variable_set("@#{key}", state)
    end

@@ -32,36 +33,38 @@
  # Update layers has params.
  def update(layers)
    target_params = layers.select { |layer| layer.is_a?(Layers::HasParamLayer) && layer.trainable }
                          .map { |layer| layer.get_params.values }.flatten.compact
-                         .select { |param| param.grad }
+                         .select(&:grad)
    clip_grads(target_params) if @clip_norm
    update_params(target_params)
    target_params.each do |param|
      param.grad = Xumo::SFloat[0]
    end
  end

- def dump
-   { hash: to_hash, status: @status }
+ def dump(require_status = true)
+   status = require_status ? @status : nil
+   { hash: to_hash, status: status }
  end

  def to_hash(merge_hash = nil)
    hash = { class: self.class.name, clip_norm: @clip_norm }
    hash.merge!(merge_hash) if merge_hash
    hash
  end

  # Update params.
  private def update_params(params)
-   raise NotImplementedError.new("Class '#{self.class.name}' has implement method 'update_params'")
+   raise NotImplementedError, "Class '#{self.class.name}' has implement method 'update_params'"
  end

  private def clip_grads(params)
-   norm = Math.sqrt(params.reduce(0) { |total, param| total + (param.grad ** 2).sum })
+   norm = Math.sqrt(params.reduce(0) { |total, param| total + (param.grad**2).sum })
    return if norm <= @clip_norm
+
    rate = @clip_norm / (norm + 1e-7)
    params.each do |param|
      param.grad *= rate
    end
  end

@@ -69,11 +72,10 @@
  def load_hash(hash)
    initialize(clip_norm: hash[:clip_norm])
  end
end

-
class SGD < Optimizer
  attr_accessor :lr
  attr_accessor :momentum

  # @param [Float] lr Learning rate.

@@ -105,27 +107,25 @@
  def load_hash(hash)
    initialize(hash[:lr], momentum: hash[:momentum], clip_norm: hash[:clip_norm])
  end
end

-
class Nesterov < SGD
  def initialize(lr = 0.01, momentum: 0.9, clip_norm: nil)
    super(lr, momentum: momentum, clip_norm: clip_norm)
  end

  private def update_params(params)
    params.each do |param|
      @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
      amount = param.grad * @lr
      @v[param.name] = @v[param.name] * @momentum - amount
-     param.data = (param.data + @momentum ** 2 * @v[param.name]) - (1 + @momentum) * amount
+     param.data = (param.data + @momentum**2 * @v[param.name]) - (1 + @momentum) * amount
    end
  end
end

-
class AdaGrad < Optimizer
  attr_accessor :lr
  attr_accessor :eps

  # @param [Float] lr Learning rate.
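Note: the two behavioral changes above are dump, which can now omit the optimizer's running status (require_status defaults to true), and Optimizer.load, which now returns early when a dump carries no status. A rough sketch of the resulting round trip, assuming the gem is loaded with require "dnn" and using Adam purely as an illustration:

    require "dnn"  # assumption: the ruby-dnn gem is required under this name

    opt = DNN::Optimizers::Adam.new(alpha: 0.001)

    full  = opt.dump         # { hash: ..., status: ... }  -- status taken from @status
    light = opt.dump(false)  # { hash: ..., status: nil }  -- configuration hash only

    # load rebuilds the optimizer from the hash; with a nil status it now
    # returns right after from_hash instead of trying to iterate a missing hash.
    restored = DNN::Optimizers::Adam.load(light)
    restored.alpha  # => 0.001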
@@ -139,11 +139,11 @@
  end

  private def update_params(params)
    params.each do |param|
      @g[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-     @g[param.name] += param.grad ** 2
+     @g[param.name] += param.grad**2
      param.data -= (@lr / Xumo::NMath.sqrt(@g[param.name] + @eps)) * param.grad
    end
  end

  def to_hash

@@ -153,11 +153,10 @@
  def load_hash(hash)
    initialize(hash[:lr], eps: hash[:eps], clip_norm: hash[:clip_norm])
  end
end

-
class RMSProp < Optimizer
  attr_accessor :lr
  attr_accessor :alpha
  attr_accessor :eps

@@ -178,21 +177,20 @@
  end

  private def update_params(params)
    params.each do |param|
      @g[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-     @g[param.name] = @alpha * @g[param.name] + (1 - @alpha) * param.grad ** 2
+     @g[param.name] = @alpha * @g[param.name] + (1 - @alpha) * param.grad**2
      param.data -= (@lr / Xumo::NMath.sqrt(@g[param.name] + @eps)) * param.grad
    end
  end

  def load_hash(hash)
    initialize(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
  end
end

-
class AdaDelta < Optimizer
  attr_accessor :rho
  attr_accessor :eps

  # @param [Float] rho Moving average index of past slopes.

@@ -212,23 +210,22 @@
  private def update_params(params)
    params.each do |param|
      @h[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
      @s[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
-     @h[param.name] = @rho * @h[param.name] + (1 - @rho) * param.grad ** 2
+     @h[param.name] = @rho * @h[param.name] + (1 - @rho) * param.grad**2
      v = (Xumo::NMath.sqrt(@s[param.name] + @eps) / Xumo::NMath.sqrt(@h[param.name] + @eps)) * param.grad
-     @s[param.name] = @rho * @s[param.name] + (1 - @rho) * v ** 2
+     @s[param.name] = @rho * @s[param.name] + (1 - @rho) * v**2
      param.data -= v
    end
  end

  def load_hash(hash)
    initialize(rho: hash[:rho], eps: hash[:eps], clip_norm: hash[:clip_norm])
  end
end

-
class RMSPropGraves < Optimizer
  attr_accessor :lr
  attr_accessor :alpha
  attr_accessor :eps

@@ -252,21 +249,20 @@
  private def update_params(params)
    params.each do |param|
      @m[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
      @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
      @m[param.name] = @alpha * @m[param.name] + (1 - @alpha) * param.grad
-     @v[param.name] = @alpha * @v[param.name] + (1 - @alpha) * param.grad ** 2
-     param.data -= (@lr / Xumo::NMath.sqrt(@v[param.name] - @m[param.name] ** 2 + @eps)) * param.grad
+     @v[param.name] = @alpha * @v[param.name] + (1 - @alpha) * param.grad**2
+     param.data -= (@lr / Xumo::NMath.sqrt(@v[param.name] - @m[param.name]**2 + @eps)) * param.grad
    end
  end

  def load_hash(hash)
    initialize(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
  end
end

-
class Adam < Optimizer
  attr_accessor :alpha
  attr_accessor :beta1
  attr_accessor :beta2
  attr_accessor :eps

@@ -298,16 +294,16 @@
    }
  end

  private def update_params(params)
    @t += 1
-   lr = @alpha * Math.sqrt(1 - @beta2 ** @t) / (1 - @beta1 ** @t)
+   lr = @alpha * Math.sqrt(1 - @beta2**@t) / (1 - @beta1**@t)
    params.each do |param|
      @m[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
      @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
      @m[param.name] += (1 - @beta1) * (param.grad - @m[param.name])
-     @v[param.name] += (1 - @beta2) * (param.grad ** 2 - @v[param.name])
+     @v[param.name] += (1 - @beta2) * (param.grad**2 - @v[param.name])
      if @amsgrad
        @s[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
        @s[param.name] = Xumo::SFloat.maximum(@s[param.name], @v[param.name])
        param.data -= lr * @m[param.name] / Xumo::NMath.sqrt(@s[param.name] + @eps)
      else
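Note: the Adam hunk above is pure operator reformatting (param.grad ** 2 becomes param.grad**2), but the touched line lr = @alpha * Math.sqrt(1 - @beta2**@t) / (1 - @beta1**@t) is the bias-corrected step size from the Adam paper. A standalone sketch of just that arithmetic, in plain Ruby with illustrative default values (no gem required):

    # Bias-corrected Adam step size, mirroring
    #   lr = @alpha * Math.sqrt(1 - @beta2**@t) / (1 - @beta1**@t)
    alpha = 0.001
    beta1 = 0.9
    beta2 = 0.999

    [1, 10, 100, 1000].each do |t|
      lr = alpha * Math.sqrt(1 - beta2**t) / (1 - beta1**t)
      puts format("t=%4d  effective lr=%.6f", t, lr)
    end
    # The correction factor tends to 1 as t grows, so the effective rate
    # approaches alpha once the moment estimates are no longer biased toward zero.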
@@ -320,11 +316,10 @@
    initialize(alpha: hash[:alpha],
               beta1: hash[:beta1],
               beta2: hash[:beta2],
               eps: hash[:eps],
               amsgrad: hash[:amsgrad],
               clip_norm: hash[:clip_norm])
  end
end

-
class AdaBound < Adam
  attr_accessor :final_lr
  attr_accessor :gamma

  # @param [Float] final_lr Final learning rate.

@@ -342,18 +337,18 @@
    }
  end

  private def update_params(params)
    @t += 1
-   lr = @alpha * Math.sqrt(1 - @beta2 ** @t) / (1 - @beta1 ** @t)
+   lr = @alpha * Math.sqrt(1 - @beta2**@t) / (1 - @beta1**@t)
    final_lr = @final_lr * lr / @alpha
    lower_bound = final_lr * (1 - 1 / (@gamma * @t + 1))
    upper_bound = final_lr * (1 + 1 / (@gamma * @t))
    params.each do |param|
      @m[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
      @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
      @m[param.name] += (1 - @beta1) * (param.grad - @m[param.name])
-     @v[param.name] += (1 - @beta2) * (param.grad ** 2 - @v[param.name])
+     @v[param.name] += (1 - @beta2) * (param.grad**2 - @v[param.name])
      if @amsgrad
        @s[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
        @s[param.name] = Xumo::SFloat.maximum(@s[param.name], @v[param.name])
        param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@s[param.name]) + @eps), lower_bound, upper_bound) * @m[param.name]
      else
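Note: in the AdaBound hunk only the exponent spacing changes; the bound schedule itself is untouched: lower_bound = final_lr * (1 - 1 / (gamma * t + 1)) and upper_bound = final_lr * (1 + 1 / (gamma * t)), with the per-parameter rate clamped into that band by clip_lr (called here, defined elsewhere in the class). A scalar re-implementation of that clamping, for illustration only (plain Ruby, not the gem's elementwise clip_lr):

    # Illustrative scalar version of AdaBound's bound schedule; it mirrors the
    # formulas in the hunk above, not the gem's actual clip_lr helper.
    def bounded_lr(lr, final_lr, gamma, t)
      lower = final_lr * (1 - 1.0 / (gamma * t + 1))
      upper = final_lr * (1 + 1.0 / (gamma * t))
      lr.clamp(lower, upper)
    end

    # The band tightens around final_lr as t grows, so late in training the
    # step size behaves like plain SGD with rate final_lr.
    puts bounded_lr(0.05, 0.1, 0.001, 1)       # early: band is wide, lr passes through
    puts bounded_lr(0.05, 0.1, 0.001, 10_000)  # late: clamped up toward final_lr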