lib/torch/optim/adamw.rb in torch-rb-0.4.2 vs lib/torch/optim/adamw.rb in torch-rb-0.5.0

- old
+ new

@@ -57,11 +57,11 @@
           bias_correction1 = 1 - beta1 ** state[:step]
           bias_correction2 = 1 - beta2 ** state[:step]

           # Decay the first and second moment running average coefficient
           exp_avg.mul!(beta1).add!(grad, alpha: 1 - beta1)
-          exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
+          exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2)
           if amsgrad
             # Maintains the maximum of all 2nd moment running avg. till now
             Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq)
             # Use the max. for normalizing running avg. of gradient
             denom = (max_exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
@@ -69,10 +69,10 @@
             denom = (exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
           end

           step_size = group[:lr] / bias_correction1

-          p.data.addcdiv!(-step_size, exp_avg, denom)
+          p.data.addcdiv!(exp_avg, denom, value: -step_size)
         end
       end

       loss
     end
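
Both changes track the upstream PyTorch signature change in which the scalar multiplier for addcmul_/addcdiv_ moved from the leading positional argument to a value= keyword; torch-rb 0.5.0 mirrors that as a value: keyword argument. Below is a minimal sketch of the new call style, assuming torch-rb >= 0.5.0; the tensors and constants are illustrative, and only the addcmul!/addcdiv! signatures themselves come from the diff above.

require "torch"

grad       = Torch.tensor([1.0, 2.0])
exp_avg_sq = Torch.zeros(2)
beta2      = 0.999

# Old form (0.4.2): exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
# New form (0.5.0): the scalar multiplier becomes the value: keyword.
exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2)

# The same pattern applies to addcdiv!; eps and lr here are made-up values.
denom = exp_avg_sq.sqrt.add!(1e-8)
param = Torch.zeros(2)
param.addcdiv!(grad, denom, value: -0.001)

Callers upgrading from 0.4.2 that invoke addcmul!/addcdiv! directly with the old positional-scalar form need the same rewrite; the optimizer's behaviour is otherwise unchanged.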