lib/torch/optim/adam.rb in torch-rb-0.4.2 vs lib/torch/optim/adam.rb in torch-rb-0.5.0

- old
+ new

@@ -56,11 +56,11 @@
               grad.add!(p.data, alpha: group[:weight_decay])
             end
 
             # Decay the first and second moment running average coefficient
             exp_avg.mul!(beta1).add!(grad, alpha: 1 - beta1)
-            exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
+            exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2)
             if amsgrad
               # Maintains the maximum of all 2nd moment running avg. till now
               Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq)
               # Use the max. for normalizing running avg. of gradient
               denom = (max_exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
@@ -68,10 +68,10 @@
               denom = (exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
             end
 
             step_size = group[:lr] / bias_correction1
 
-            p.data.addcdiv!(-step_size, exp_avg, denom)
+            p.data.addcdiv!(exp_avg, denom, value: -step_size)
           end
         end
         loss
       end
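
Both hunks make the same kind of change: the scalar multiplier moves out of the first positional argument and into a value: keyword on addcmul! and addcdiv!. Below is a minimal sketch of the new calling convention (it assumes torch-rb >= 0.5.0 is installed; the tensor values, beta/eps/lr constants, and variable names are illustrative, not taken from the gem):

require "torch"

# One Adam-style moment update and parameter step using the 0.5.0 keyword form.
grad       = Torch.tensor([1.0, 2.0, 3.0])
exp_avg    = Torch.zeros(3)
exp_avg_sq = Torch.zeros(3)
param      = Torch.ones(3)
beta1, beta2, eps, lr = 0.9, 0.999, 1e-8, 1e-3

# First and second moment estimates (the second line is the changed addcmul! form):
exp_avg.mul!(beta1).add!(grad, alpha: 1 - beta1)
exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2)

# Parameter step (the changed addcdiv! form); bias correction omitted for brevity.
denom = exp_avg_sq.sqrt.add!(eps)
param.addcdiv!(exp_avg, denom, value: -lr)

The new signatures likely track the corresponding change in PyTorch, which deprecated the scalar-first overloads of addcmul_/addcdiv_ in favor of a value= keyword argument.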