lib/dnn/core/optimizers.rb in ruby-dnn-0.7.3 vs lib/dnn/core/optimizers.rb in ruby-dnn-0.8.0

- old (ruby-dnn-0.7.3)
+ new (ruby-dnn-0.8.0)

@@ -7,12 +7,13 @@
       def initialize(learning_rate)
         @learning_rate = learning_rate
       end
 
-      # Update layer has params.
-      def update(layer) end
+      # Update params.
+      # Classes that inherit from this class must implement this method.
+      # def update(params) end
 
       def to_hash(merge_hash = nil)
         hash = {class: self.class.name, learning_rate: @learning_rate}
         hash.merge!(merge_hash) if merge_hash
         hash
@@ -31,20 +32,19 @@
         super(learning_rate)
         @momentum = momentum
         @v = {}
       end
 
-      def update(layer)
-        @v[layer] ||= {}
-        layer.params.each_key do |key|
-          amount = layer.grads[key] * @learning_rate
+      def update(params)
+        params.select { |key, param| param.is_a?(LearningParam) }.each_value do |param|
+          amount = param.grad * @learning_rate
           if @momentum > 0
-            @v[layer][key] ||= 0
-            amount += @momentum * @v[layer][key]
-            @v[layer][key] = amount
+            @v[param] ||= 0
+            amount += @momentum * @v[param]
+            @v[param] = amount
           end
-          layer.params[key] -= amount
+          param.data -= amount
         end
       end
 
       def to_hash
         super({momentum: @momentum})
@@ -59,17 +59,16 @@
       def initialize(learning_rate = 0.01, momentum: 0.9)
         super(learning_rate, momentum: momentum)
       end
 
-      def update(layer)
-        @v[layer] ||= {}
-        layer.params.each_key do |key|
-          @v[layer][key] ||= 0
-          amount = layer.grads[key] * @learning_rate
-          @v[layer][key] = @v[layer][key] * @momentum - amount
-          layer.params[key] = (layer.params[key] + @momentum**2 * @v[layer][key]) - (1 + @momentum) * amount
+      def update(params)
+        params.select { |key, param| param.is_a?(LearningParam) }.each_value do |param|
+          @v[param] ||= 0
+          amount = param.grad * @learning_rate
+          @v[param] = @v[param] * @momentum - amount
+          param.data = (param.data + @momentum**2 * @v[param]) - (1 + @momentum) * amount
         end
       end
     end
@@ -81,16 +80,15 @@
       def self.load_hash(hash)
         self.new(hash[:learning_rate])
       end
 
-      def update(layer)
-        @g[layer] ||= {}
-        layer.params.each_key do |key|
-          @g[layer][key] ||= 0
-          @g[layer][key] += layer.grads[key]**2
-          layer.params[key] -= (@learning_rate / Xumo::NMath.sqrt(@g[layer][key] + 1e-7)) * layer.grads[key]
+      def update(params)
+        params.select { |key, param| param.is_a?(LearningParam) }.each_value do |param|
+          @g[param] ||= 0
+          @g[param] += param.grad**2
+          param.data -= (@learning_rate / Xumo::NMath.sqrt(@g[param] + 1e-7)) * param.grad
         end
       end
     end
@@ -105,16 +103,15 @@
         super(learning_rate)
         @alpha = alpha
         @g = {}
       end
 
-      def update(layer)
-        @g[layer] ||= {}
-        layer.params.each_key do |key|
-          @g[layer][key] ||= 0
-          @g[layer][key] = @alpha * @g[layer][key] + (1 - @alpha) * layer.grads[key]**2
-          layer.params[key] -= (@learning_rate / Xumo::NMath.sqrt(@g[layer][key] + 1e-7)) * layer.grads[key]
+      def update(params)
+        params.select { |key, param| param.is_a?(LearningParam) }.each_value do |param|
+          @g[param] ||= 0
+          @g[param] = @alpha * @g[param] + (1 - @alpha) * param.grad**2
+          param.data -= (@learning_rate / Xumo::NMath.sqrt(@g[param] + 1e-7)) * param.grad
         end
       end
 
       def to_hash
         super({alpha: @alpha})
@@ -134,20 +131,18 @@
         @rho = rho
         @h = {}
         @s = {}
       end
 
-      def update(layer)
-        @h[layer] ||= {}
-        @s[layer] ||= {}
-        layer.params.each_key do |key|
-          @h[layer][key] ||= Xumo::SFloat.zeros(*layer.params[key].shape)
-          @s[layer][key] ||= Xumo::SFloat.zeros(*layer.params[key].shape)
-          @h[layer][key] = @rho * @h[layer][key] + (1 - @rho) * layer.grads[key]**2
-          v = (Xumo::NMath.sqrt(@s[layer][key] + 1e-6) / Xumo::NMath.sqrt(@h[layer][key] + 1e-6)) * layer.grads[key]
-          @s[layer][key] = @rho * @s[layer][key] + (1 - @rho) * v**2
-          layer.params[key] -= v
+      def update(params)
+        params.select { |key, param| param.is_a?(LearningParam) }.each_value do |param|
+          @h[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+          @h[param] = @rho * @h[param] + (1 - @rho) * param.grad**2
+          v = (Xumo::NMath.sqrt(@s[param] + 1e-6) / Xumo::NMath.sqrt(@h[param] + 1e-6)) * param.grad
+          @s[param] = @rho * @s[param] + (1 - @rho) * v**2
+          param.data -= v
         end
       end
 
       def to_hash
         super({rho: @rho})
@@ -170,20 +165,18 @@
         @iter = 0
         @m = {}
         @v = {}
       end
 
-      def update(layer)
+      def update(params)
         @iter += 1
-        @m[layer] ||= {}
-        @v[layer] ||= {}
         lr = @learning_rate * Math.sqrt(1 - @beta2**@iter) / (1 - @beta1**@iter)
-        layer.params.each_key do |key|
-          @m[layer][key] ||= 0
-          @v[layer][key] ||= 0
-          @m[layer][key] += (1 - @beta1) * (layer.grads[key] - @m[layer][key])
-          @v[layer][key] += (1 - @beta2) * (layer.grads[key]**2 - @v[layer][key])
-          layer.params[key] -= lr * @m[layer][key] / Xumo::NMath.sqrt(@v[layer][key] + 1e-7)
+        params.select { |key, param| param.is_a?(LearningParam) }.each_value do |param|
+          @m[param] ||= 0
+          @v[param] ||= 0
+          @m[param] += (1 - @beta1) * (param.grad - @m[param])
+          @v[param] += (1 - @beta2) * (param.grad**2 - @v[param])
+          param.data -= lr * @m[param] / Xumo::NMath.sqrt(@v[param] + 1e-7)
         end
       end
 
       def to_hash
         super({beta1: @beta1, beta2: @beta2})
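For context on the API change shown above: in 0.7.3 every optimizer's update took a whole layer and walked layer.params / layer.grads by key, keeping state in nested hashes such as @v[layer][key]; in 0.8.0 update takes a params hash, skips any entry that is not a LearningParam, reads the gradient from param.grad, writes the new value to param.data, and keys its state by the param object itself. The sketch below is a minimal, self-contained paraphrase of the new SGD code path; the LearningParam stand-in and the MiniSGD class are illustrative only (the real DNN::LearningParam constructor and the optimizer wiring inside the library may differ), and it assumes numo-narray is installed.

    require "numo/narray"

    # Stand-in for ruby-dnn's LearningParam: we only assume it exposes
    # `data` (the parameter tensor) and `grad` (its gradient).
    class LearningParam
      attr_accessor :data, :grad

      def initialize(data, grad)
        @data = data
        @grad = grad
      end
    end

    # Minimal SGD-style updater mirroring the 0.8.0 code path: iterate the
    # params hash, skip anything that is not a LearningParam, and key the
    # momentum state by the param object itself.
    class MiniSGD
      def initialize(learning_rate = 0.01, momentum: 0)
        @learning_rate = learning_rate
        @momentum = momentum
        @v = {}
      end

      def update(params)
        params.select { |key, param| param.is_a?(LearningParam) }.each_value do |param|
          amount = param.grad * @learning_rate
          if @momentum > 0
            @v[param] ||= 0
            amount += @momentum * @v[param]
            @v[param] = amount
          end
          param.data -= amount
        end
      end
    end

    weight = LearningParam.new(Numo::SFloat[[0.5, -0.2]], Numo::SFloat[[0.1, 0.3]])
    params = { weight: weight, running_mean: Numo::SFloat[0.0] }  # non-LearningParam entry is skipped

    MiniSGD.new(0.01, momentum: 0.9).update(params)
    p weight.data  # parameters are updated in place via param.data

Keying optimizer state by the param object (rather than by layer and key name) lets one optimizer instance track momentum or moment estimates for any number of layers without knowing anything about layer structure, which is what makes the base-class change from update(layer) to update(params) possible.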