lib/torch/optim/adam.rb in torch-rb-0.4.2 vs lib/torch/optim/adam.rb in torch-rb-0.5.0
- old
+ new
@@ -56,11 +56,11 @@
               grad.add!(p.data, alpha: group[:weight_decay])
             end
 
             # Decay the first and second moment running average coefficient
             exp_avg.mul!(beta1).add!(grad, alpha: 1 - beta1)
-            exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
+            exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2)
             if amsgrad
               # Maintains the maximum of all 2nd moment running avg. till now
               Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq)
               # Use the max. for normalizing running avg. of gradient
               denom = (max_exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
@@ -68,10 +68,10 @@
               denom = (exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
             end
             step_size = group[:lr] / bias_correction1
-            p.data.addcdiv!(-step_size, exp_avg, denom)
+            p.data.addcdiv!(exp_avg, denom, value: -step_size)
           end
         end
         loss
       end
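
The change in both hunks is the same signature update: the in-place addcmul! and addcdiv! now take their two tensor arguments first and the scalar multiplier as a value: keyword instead of a leading positional scalar, matching the keyword form of PyTorch's addcmul_/addcdiv_. In the Adam step these lines update the running average of squared gradients and apply the bias-corrected parameter update. A minimal migration sketch for a call site, assuming torch-rb 0.5.0 or later; the tensor values below are made up for illustration and are not part of the library:

require "torch"

# Second-moment update, as in the first hunk above.
grad       = Torch.tensor([0.5, -1.0, 2.0])
exp_avg_sq = Torch.zeros(3)
beta2      = 0.999

# 0.4.2 and earlier: exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
# 0.5.0 and later: tensors first, scalar passed as value:
exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2)

# Parameter update, as in the second hunk above.
param     = Torch.tensor([1.0, 2.0, 3.0])
exp_avg   = Torch.tensor([0.1, 0.2, 0.3])
denom     = exp_avg_sq.sqrt.add!(1e-8)
step_size = 0.001

# 0.4.2 and earlier: param.addcdiv!(-step_size, exp_avg, denom)
param.addcdiv!(exp_avg, denom, value: -step_size)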