lib/torch/optim/adamw.rb in torch-rb-0.4.2 vs lib/torch/optim/adamw.rb in torch-rb-0.5.0
- old
+ new
@@ -57,11 +57,11 @@
            bias_correction1 = 1 - beta1 ** state[:step]
            bias_correction2 = 1 - beta2 ** state[:step]
            # Decay the first and second moment running average coefficient
            exp_avg.mul!(beta1).add!(grad, alpha: 1 - beta1)
-           exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
+           exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2)
            if amsgrad
              # Maintains the maximum of all 2nd moment running avg. till now
              Torch.max(max_exp_avg_sq, exp_avg_sq, out: max_exp_avg_sq)
              # Use the max. for normalizing running avg. of gradient
              denom = (max_exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
@@ -69,10 +69,10 @@
              denom = (exp_avg_sq.sqrt / Math.sqrt(bias_correction2)).add!(group[:eps])
            end
            step_size = group[:lr] / bias_correction1
-           p.data.addcdiv!(-step_size, exp_avg, denom)
+           p.data.addcdiv!(exp_avg, denom, value: -step_size)
          end
        end
        loss
      end
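
Both hunks track the same signature change: in torch-rb 0.5.0, addcmul! and addcdiv! take the two tensor operands first and the scalar multiplier as a value: keyword, rather than the scalar as the first positional argument (mirroring the newer PyTorch signatures). A minimal sketch of the new call style follows; the tensor names and values are illustrative examples, not taken from the library source:

require "torch"

grad       = Torch.tensor([0.5, -1.0, 2.0])
exp_avg_sq = Torch.zeros(3)
beta2      = 0.999

# 0.4.2 call shape: exp_avg_sq.mul!(beta2).addcmul!(1 - beta2, grad, grad)
# 0.5.0 call shape: tensors first, scalar as the value: keyword
exp_avg_sq.mul!(beta2).addcmul!(grad, grad, value: 1 - beta2)

# addcdiv! follows the same pattern
denom  = exp_avg_sq.sqrt.add!(1e-8)
update = Torch.zeros(3)
update.addcdiv!(grad, denom, value: -0.001)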