Diff of lib/dnn/core/optimizers.rb: ruby-dnn-0.14.3 (old) vs ruby-dnn-0.15.0 (new)
Legend: lines prefixed with "-" were removed from 0.14.3 (old); lines prefixed with "+" were added in 0.15.0 (new).
Summary of the change: optimizer state-dump API (`status`, `self.load`, `dump`) removed; `lr` became a keyword argument across optimizers; per-parameter state hashes are now keyed by the param object instead of `param.name`; `Layers::HasParamLayer` renamed to `Layers::TrainableLayer`.
@@ -1,11 +1,10 @@
module DNN
module Optimizers
# Super class of all optimizer classes.
class Optimizer
- attr_reader :status
attr_accessor :clip_norm
def self.from_hash(hash)
return nil unless hash
optimizer_class = DNN.const_get(hash[:class])
@@ -13,43 +12,27 @@
raise DNN_Error, "#{optimizer.class} is not an instance of #{self} class." unless optimizer.is_a?(self)
optimizer.load_hash(hash)
optimizer
end
- def self.load(dumped)
- opt = from_hash(dumped[:hash])
- return opt unless dumped[:status]
- dumped[:status].each do |key, state|
- state = state.clone
- opt.status[key] = state
- opt.instance_variable_set("@#{key}", state)
- end
- opt
- end
-
# @param [Float | NilClass] clip_norm Gradient clip norm.
def initialize(clip_norm: nil)
@clip_norm = clip_norm
end
# Update layers has params.
def update(layers)
- target_params = layers.select { |layer| layer.is_a?(Layers::HasParamLayer) && layer.trainable }
+ target_params = layers.select { |layer| layer.is_a?(Layers::TrainableLayer) && layer.trainable }
.map { |layer| layer.get_params.values }.flatten.compact
.select(&:grad)
clip_grads(target_params) if @clip_norm
update_params(target_params)
target_params.each do |param|
param.grad = Xumo::SFloat[0]
end
end
- def dump(require_status = true)
- status = require_status ? @status : nil
- { hash: to_hash, status: status }
- end
-
def to_hash(merge_hash = nil)
hash = { class: self.class.name, clip_norm: @clip_norm }
hash.merge!(merge_hash) if merge_hash
hash
end
@@ -78,82 +61,80 @@
attr_accessor :lr
attr_accessor :momentum
# @param [Float] lr Learning rate.
# @param [Float] momentum Momentum coefficient.
- def initialize(lr = 0.01, momentum: 0, clip_norm: nil)
+ def initialize(lr: 0.01, momentum: 0, clip_norm: nil)
super(clip_norm: clip_norm)
@lr = lr
@momentum = momentum
@v = {}
- @status = { v: @v }
end
def to_hash
super(lr: @lr, momentum: @momentum)
end
private def update_params(params)
params.each do |param|
amount = param.grad * @lr
if @momentum > 0
- @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- amount += @momentum * @v[param.name]
- @v[param.name] = amount
+ @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ amount += @momentum * @v[param]
+ @v[param] = amount
end
param.data -= amount
end
end
def load_hash(hash)
- initialize(hash[:lr], momentum: hash[:momentum], clip_norm: hash[:clip_norm])
+ initialize(lr: hash[:lr], momentum: hash[:momentum], clip_norm: hash[:clip_norm])
end
end
class Nesterov < SGD
- def initialize(lr = 0.01, momentum: 0.9, clip_norm: nil)
- super(lr, momentum: momentum, clip_norm: clip_norm)
+ def initialize(lr: 0.01, momentum: 0.9, clip_norm: nil)
+ super(lr: lr, momentum: momentum, clip_norm: clip_norm)
end
private def update_params(params)
params.each do |param|
- @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
amount = param.grad * @lr
- @v[param.name] = @v[param.name] * @momentum - amount
- param.data = (param.data + @momentum**2 * @v[param.name]) - (1 + @momentum) * amount
+ @v[param] = @v[param] * @momentum - amount
+ param.data = (param.data + @momentum**2 * @v[param]) - (1 + @momentum) * amount
end
end
end
class AdaGrad < Optimizer
attr_accessor :lr
attr_accessor :eps
# @param [Float] lr Learning rate.
# @param [Float] eps Value to avoid division by zero.
- def initialize(lr = 0.01, eps: 1e-7, clip_norm: nil)
+ def initialize(lr: 0.01, eps: 1e-7, clip_norm: nil)
super(clip_norm: clip_norm)
@lr = lr
@eps = eps
@g = {}
- @status = { g: @g }
end
private def update_params(params)
params.each do |param|
- @g[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @g[param.name] += param.grad**2
- param.data -= (@lr / Xumo::NMath.sqrt(@g[param.name] + @eps)) * param.grad
+ @g[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @g[param] += param.grad**2
+ param.data -= (@lr / Xumo::NMath.sqrt(@g[param] + @eps)) * param.grad
end
end
def to_hash
super(lr: @lr, eps: @eps)
end
def load_hash(hash)
- initialize(hash[:lr], eps: hash[:eps], clip_norm: hash[:clip_norm])
+ initialize(lr: hash[:lr], eps: hash[:eps], clip_norm: hash[:clip_norm])
end
end
class RMSProp < Optimizer
attr_accessor :lr
@@ -161,33 +142,32 @@
attr_accessor :eps
# @param [Float] lr Learning rate.
# @param [Float] alpha Moving average index of past slopes.
# @param [Float] eps Value to avoid division by zero.
- def initialize(lr = 0.001, alpha: 0.9, eps: 1e-7, clip_norm: nil)
+ def initialize(lr: 0.001, alpha: 0.9, eps: 1e-7, clip_norm: nil)
super(clip_norm: clip_norm)
@lr = lr
@alpha = alpha
@eps = eps
@g = {}
- @status = { g: @g }
end
def to_hash
super(lr: @lr, alpha: @alpha, eps: @eps)
end
private def update_params(params)
params.each do |param|
- @g[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @g[param.name] = @alpha * @g[param.name] + (1 - @alpha) * param.grad**2
- param.data -= (@lr / Xumo::NMath.sqrt(@g[param.name] + @eps)) * param.grad
+ @g[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @g[param] = @alpha * @g[param] + (1 - @alpha) * param.grad**2
+ param.data -= (@lr / Xumo::NMath.sqrt(@g[param] + @eps)) * param.grad
end
end
def load_hash(hash)
- initialize(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
+ initialize(lr: hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
end
end
class AdaDelta < Optimizer
attr_accessor :rho
@@ -199,24 +179,23 @@
super(clip_norm: clip_norm)
@rho = rho
@eps = eps
@h = {}
@s = {}
- @status = { h: @h, s: @s }
end
def to_hash
super(rho: @rho, eps: @eps)
end
private def update_params(params)
params.each do |param|
- @h[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @s[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @h[param.name] = @rho * @h[param.name] + (1 - @rho) * param.grad**2
- v = (Xumo::NMath.sqrt(@s[param.name] + @eps) / Xumo::NMath.sqrt(@h[param.name] + @eps)) * param.grad
- @s[param.name] = @rho * @s[param.name] + (1 - @rho) * v**2
+ @h[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @h[param] = @rho * @h[param] + (1 - @rho) * param.grad**2
+ v = (Xumo::NMath.sqrt(@s[param] + @eps) / Xumo::NMath.sqrt(@h[param] + @eps)) * param.grad
+ @s[param] = @rho * @s[param] + (1 - @rho) * v**2
param.data -= v
end
end
def load_hash(hash)
@@ -230,36 +209,35 @@
attr_accessor :eps
# @param [Float] lr Learning rate.
# @param [Float] alpha Moving average index of past slopes.
# @param [Float] eps Value to avoid division by zero.
- def initialize(lr = 0.0001, alpha: 0.95, eps: 0.0001, clip_norm: nil)
+ def initialize(lr: 0.0001, alpha: 0.95, eps: 0.0001, clip_norm: nil)
super(clip_norm: clip_norm)
@lr = lr
@alpha = alpha
@eps = eps
@m = {}
@v = {}
- @status = { m: @m, v: @v }
end
def to_hash
super(lr: @lr, alpha: @alpha, eps: @eps)
end
private def update_params(params)
params.each do |param|
- @m[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @m[param.name] = @alpha * @m[param.name] + (1 - @alpha) * param.grad
- @v[param.name] = @alpha * @v[param.name] + (1 - @alpha) * param.grad**2
- param.data -= (@lr / Xumo::NMath.sqrt(@v[param.name] - @m[param.name]**2 + @eps)) * param.grad
+ @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @m[param] = @alpha * @m[param] + (1 - @alpha) * param.grad
+ @v[param] = @alpha * @v[param] + (1 - @alpha) * param.grad**2
+ param.data -= (@lr / Xumo::NMath.sqrt(@v[param] - @m[param]**2 + @eps)) * param.grad
end
end
def load_hash(hash)
- initialize(hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
+ initialize(lr: hash[:lr], alpha: hash[:alpha], eps: hash[:eps], clip_norm: hash[:clip_norm])
end
end
class Adam < Optimizer
attr_accessor :alpha
@@ -282,11 +260,10 @@
@amsgrad = amsgrad
@t = 0
@m = {}
@v = {}
@s = amsgrad ? {} : nil
- @status = { t: @t, m: @m, v: @v, s: @s }
end
def to_hash
{
class: self.class.name, alpha: @alpha, beta1: @beta1, beta2: @beta2,
@@ -296,20 +273,20 @@
private def update_params(params)
@t += 1
lr = @alpha * Math.sqrt(1 - @beta2**@t) / (1 - @beta1**@t)
params.each do |param|
- @m[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @m[param.name] += (1 - @beta1) * (param.grad - @m[param.name])
- @v[param.name] += (1 - @beta2) * (param.grad**2 - @v[param.name])
+ @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @m[param] += (1 - @beta1) * (param.grad - @m[param])
+ @v[param] += (1 - @beta2) * (param.grad**2 - @v[param])
if @amsgrad
- @s[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @s[param.name] = Xumo::SFloat.maximum(@s[param.name], @v[param.name])
- param.data -= lr * @m[param.name] / Xumo::NMath.sqrt(@s[param.name] + @eps)
+ @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @s[param] = Xumo::SFloat.maximum(@s[param], @v[param])
+ param.data -= lr * @m[param] / Xumo::NMath.sqrt(@s[param] + @eps)
else
- param.data -= lr * @m[param.name] / Xumo::NMath.sqrt(@v[param.name] + @eps)
+ param.data -= lr * @m[param] / Xumo::NMath.sqrt(@v[param] + @eps)
end
end
end
def load_hash(hash)
@@ -342,19 +319,19 @@
lr = @alpha * Math.sqrt(1 - @beta2**@t) / (1 - @beta1**@t)
final_lr = @final_lr * lr / @alpha
lower_bound = final_lr * (1 - 1 / (@gamma * @t + 1))
upper_bound = final_lr * (1 + 1 / (@gamma * @t))
params.each do |param|
- @m[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @v[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @m[param.name] += (1 - @beta1) * (param.grad - @m[param.name])
- @v[param.name] += (1 - @beta2) * (param.grad**2 - @v[param.name])
+ @m[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @v[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @m[param] += (1 - @beta1) * (param.grad - @m[param])
+ @v[param] += (1 - @beta2) * (param.grad**2 - @v[param])
if @amsgrad
- @s[param.name] ||= Xumo::SFloat.zeros(*param.data.shape)
- @s[param.name] = Xumo::SFloat.maximum(@s[param.name], @v[param.name])
- param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@s[param.name]) + @eps), lower_bound, upper_bound) * @m[param.name]
+ @s[param] ||= Xumo::SFloat.zeros(*param.data.shape)
+ @s[param] = Xumo::SFloat.maximum(@s[param], @v[param])
+ param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@s[param]) + @eps), lower_bound, upper_bound) * @m[param]
else
- param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@v[param.name]) + @eps), lower_bound, upper_bound) * @m[param.name]
+ param.data -= clip_lr(lr / (Xumo::NMath.sqrt(@v[param]) + @eps), lower_bound, upper_bound) * @m[param]
end
end
end
private def clip_lr(lr, lower_bound, upper_bound)