bench/cumo_bench.rb in cumo-0.1.0 vs bench/cumo_bench.rb in cumo-0.1.1

- old
+ new

@@ -6,157 +6,179 @@ a = Cumo::Float32.new(10).seq(1) b = Cumo::Float32.new(10).seq(10,10) c = a + b c.free -def elementwise - puts 'element-wise' +def elementwise(num = nil) + num ||= NUM + puts "elementwise(#{num})" Benchmark.bm do |r| a = Cumo::Float32.new(10000).seq(1) b = Cumo::Float32.new(10000).seq(10,10) (a + b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**4') do NUM.times do (a + b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(100000).seq(1) b = Cumo::Float32.new(100000).seq(10,10) (a + b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**5') do NUM.times do (a + b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(1000000).seq(1) b = Cumo::Float32.new(1000000).seq(10,10) (a + b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**6') do NUM.times do (a + b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(10000000).seq(1) b = Cumo::Float32.new(10000000).seq(10,10) (a + b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**7') do NUM.times do (a + b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(100000000).seq(1) b = Cumo::Float32.new(100000000).seq(10,10) (a + b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**8') do NUM.times do (a + b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end end end -def reduction - puts 'reduction' +def reduction(num = nil) + num ||= NUM + puts "reduction(#{num})" Benchmark.bm do |r| a = Cumo::Float32.new(10000).seq(1) + (a.sum).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**4') do NUM.times do (a.sum).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(100000).seq(1) + (a.sum).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**5') do NUM.times do (a.sum).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(1000000).seq(1) + (a.sum).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**6') do NUM.times do (a.sum).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(10000000).seq(1) + (a.sum).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**7') do NUM.times do (a.sum).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(100000000).seq(1) + (a.sum).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**8') do NUM.times do (a.sum).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end end end -def dot - num = 3 - puts 'dot' +def dot(num = nil) + num ||= 1 + puts "dot(#{num})" Benchmark.bm do |r| a = Cumo::Float32.new(100,100).seq(1) b = Cumo::Float32.new(100,100).seq(10,10) a.dot(b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**4') do num.times do a.dot(b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(100,1000).seq(1) b = Cumo::Float32.new(1000,100).seq(10,10) a.dot(b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**5') do num.times do a.dot(b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(100,10000).seq(1) b = Cumo::Float32.new(10000,100).seq(10,10) a.dot(b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**6') do num.times do a.dot(b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(100,100000).seq(1) b = Cumo::Float32.new(100000,100).seq(10,10) a.dot(b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**7') do num.times do a.dot(b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end end a = Cumo::Float32.new(100,1000000).seq(1) b = Cumo::Float32.new(1000000,100).seq(10,10) a.dot(b).free # warm up + Cumo::CUDA::Runtime.cudaDeviceSynchronize r.report('10**8') do num.times do a.dot(b).free Cumo::CUDA::Runtime.cudaDeviceSynchronize end @@ -168,26 +190,26 @@ reduction dot # Tesla V100-SXM2... # -# element-wise +# element-wise(100) # user system total real -# 10**4 0.000000 0.000000 0.000000 ( 0.005769) -# 10**5 0.010000 0.000000 0.010000 ( 0.006609) -# 10**6 0.000000 0.010000 0.010000 ( 0.010313) -# 10**7 0.040000 0.010000 0.050000 ( 0.050986) -# 10**8 0.310000 0.130000 0.440000 ( 0.449699) -# reduction +# 10**4 0.000000 0.000000 0.000000 ( 0.006332) +# 10**5 0.000000 0.000000 0.000000 ( 0.006280) +# 10**6 0.010000 0.000000 0.010000 ( 0.008123) +# 10**7 0.000000 0.010000 0.010000 ( 0.022176) +# 10**8 0.100000 0.050000 0.150000 ( 0.151999) +# reduction(100) # user system total real -# 10**4 0.010000 0.000000 0.010000 ( 0.009484) -# 10**5 0.020000 0.010000 0.030000 ( 0.022071) -# 10**6 0.100000 0.050000 0.150000 ( 0.152070) -# 10**7 1.150000 0.600000 1.750000 ( 1.754977) -# 10**8 11.720000 5.750000 17.470000 ( 17.470990) -# dot +# 10**4 0.010000 0.000000 0.010000 ( 0.009735) +# 10**5 0.010000 0.010000 0.020000 ( 0.022882) +# 10**6 0.110000 0.050000 0.160000 ( 0.154641) +# 10**7 1.220000 0.590000 1.810000 ( 1.805643) +# 10**8 11.840000 6.110000 17.950000 ( 17.946511) +# dot(1) # user system total real -# 10**4 0.000000 0.000000 0.000000 ( 0.000351) -# 10**5 0.000000 0.000000 0.000000 ( 0.000838) -# 10**6 0.000000 0.000000 0.000000 ( 0.002702) -# 10**7 0.020000 0.010000 0.030000 ( 0.024650) -# 10**8 0.180000 0.060000 0.240000 ( 0.245101) +# 10**4 0.000000 0.000000 0.000000 ( 0.000206) +# 10**5 0.000000 0.000000 0.000000 ( 0.000195) +# 10**6 0.000000 0.000000 0.000000 ( 0.000239) +# 10**7 0.000000 0.000000 0.000000 ( 0.000719) +# 10**8 0.010000 0.000000 0.010000 ( 0.004636)