/* STARTDEF void bones_prekernel__0(, ); ENDDEF */
// ---------------------------------------------------------------------------
// NOTE(review): This file appears to be a skeleton template (Bones-style
// source-to-source compiler output) for a CUDA reduction, in which the
// angle-bracket substitution slots (e.g. <in0>, <type>, parameter lists) have
// been stripped out -- TODO confirm against the original skeleton library.
// That leaves empty parameter lists "( , , )", dangling assignments such as
// "bones_private_memory = ;", and untyped declarations such as
// "volatile __shared__ bones_local_memory[512];". The file is therefore NOT
// compilable as-is; the comments below mark each hole where a slot is
// missing. The source had also been collapsed onto a single physical line,
// which made every "//" comment swallow the code after it; only line breaks
// and comments have been restored/added here -- no code tokens were changed.
// ---------------------------------------------------------------------------

// Start of the kernel (main, not unrolled kernel)
// Stage 1 of the reduction: each thread block reduces its chunk of the input
// to one partial result. Each thread first accumulates several elements
// sequentially into private memory; the block then combines the per-thread
// values with a shared-memory tree reduction, and thread 0 writes the block's
// partial result to global memory.
__global__ void bones_kernel__0(int bones_input_size, , , ) {  // missing slots: input/output pointer parameters
  // Work distribution: elements per block, number of threads actually used in
  // parallel, and sequential iterations per thread. DIV_CEIL and BONES_MIN
  // are presumably macros defined elsewhere in the skeleton library -- verify.
  const int bones_threadblock_work = DIV_CEIL(bones_input_size,gridDim.x);
  const int bones_parallel_work = BONES_MIN(blockDim.x,bones_threadblock_work);
  const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work);
  const int bones_local_id = threadIdx.x;
  const int bones_global_id = blockIdx.x*bones_parallel_work + threadIdx.x;
  int bones_iter_id = ;  // missing slot: this thread's initial element index

  // Load data into thread private memory and perform the first computation(s) sequentially
  bones_temporary = [bones_iter_id];  // missing slots: private-variable declarations and the input array name
  bones_private_memory = ;            // missing slot: initial accumulation expression
  for(int c=1; c;                     // missing slot: loop bound/increment -- likely "c<bones_sequential_work; c++) { ..." lost with the '<'-delimited text
    if (bones_iter_id <= ) {          // missing slot: last valid index bound
      bones_temporary = [bones_iter_id];
      bones_private_memory = ;        // missing slot: the reduction-operator expression
    }
  }

  // Initialize the local memory
  volatile __shared__ bones_local_memory[512];  // missing slot: element type
  bones_local_memory[bones_local_id] = bones_private_memory;
  __syncthreads();

  // Perform the remainder of the computations in parallel using a parallel reduction tree
  int bones_offset_id;
  for (int c=512; c>=2; c=c>>1) {
    if ((2*bones_parallel_work > c) && (threadIdx.x < c/2)) {
      bones_offset_id = threadIdx.x+c/2;
      if (bones_offset_id < bones_parallel_work) {
        // NOTE(review): __syncthreads() inside a divergent branch is illegal
        // CUDA -- not all threads of the block reach this barrier. Verify
        // against the original skeleton; this placement may itself be an
        // artifact of the stripped template text.
        __syncthreads();
        bones_local_memory[bones_local_id] = ;  // missing slot: combine local[id] with local[offset_id]
      }
    }
    __syncthreads();
  }

  // Write the final result back to the global memory
  if (threadIdx.x == 0) {
    [blockIdx.x] = bones_local_memory[0];  // missing slot: output array name
  }
}

// Start of the kernel (secondary, not unrolled kernel)
// Stage 2: a single block reduces the per-block partial results produced by
// bones_kernel__0 (128 of them in the multi-kernel path below) to one value.
__global__ void bones_kernel__1( , , ) {  // missing slots: temp-input/output pointer parameters
  const int bones_local_id = threadIdx.x;
  const int bones_global_id = threadIdx.x;

  // Initialize the local memory
  volatile __shared__ bones_local_memory[512];  // missing slot: element type
  bones_local_memory[bones_local_id] = [bones_global_id];  // missing slot: input (partials) array name
  __syncthreads();

  // Perform reduction using a parallel reduction tree
  int bones_offset_id;
  for (int c=128; c>=2; c=c>>1) {
    if (threadIdx.x < c/2) {
      bones_offset_id = threadIdx.x+c/2;
      bones_local_memory[bones_local_id] = ;  // missing slot: combine expression
      // NOTE(review): __syncthreads() inside this divergent "if" is illegal
      // CUDA; a correct tree reduction places the barrier outside the branch.
      // Verify against the original skeleton.
      __syncthreads();
    }
  }

  // Write the final result back to the global memory
  if (threadIdx.x == 0) {
    [0] = bones_local_memory[0];  // missing slot: output array name
  }
}

// Start of the kernel (final, initial value kernel)
// Stage 3: folds the user-supplied initial value into the reduced result
// stored at element 0 of the output. Launched with a single thread.
__global__ void bones_kernel__2( bones_initial_value, , ) {  // missing slots: pointer type of the initial value and output parameters
  bones_private_memory = [0];  // missing slots: declarations and the output array name
  bones_temporary = bones_initial_value[0];
  [0] = ;                      // missing slot: combined-result expression
}

// Function to start the kernel
// Host-side launcher: snapshots the initial value on the device, then runs
// either a single launch (small inputs, <= 1024 elements, half as many
// threads as elements) or a two-stage launch (512 threads x 128 blocks, then
// one 128-thread block over the 128 partials), and finally folds the initial
// value back in with a 1-thread, 1-block kernel.
extern "C" void bones_prekernel__0(, ) {  // missing slots: input-size and array parameters
  // Store the initial value
  // NOTE(review): no CUDA API return codes are checked anywhere in this
  // launcher -- error handling is left to the caller (or absent).
  * bones_initial_value = 0;                    // missing slot: element type of the pointer declaration
  cudaMalloc(&bones_initial_value, sizeof());   // missing slot: sizeof(<type>)
  cudaMemcpy(bones_initial_value, , sizeof(), cudaMemcpyDeviceToDevice);  // missing slots: source pointer and sizeof(<type>)

  // Run either one kernel or multiple kernels
  if ( <= 1024) {  // missing slot: input size
    // Start only one kernel
    const int bones_num_threads = DIV_CEIL(,2);  // missing slot: input size
    dim3 bones_threads(bones_num_threads);
    dim3 bones_grid(1);
    bones_kernel__0<<< bones_grid, bones_threads >>>(,,,);  // missing slots: kernel arguments
  } else {
    // Allocate space for an intermediate array
    * bones_device_temp = 0;                       // missing slot: element type of the pointer declaration
    cudaMalloc(&bones_device_temp, 128*sizeof());  // one partial per block (128 blocks below); missing slot: sizeof(<type>)
    // Start the first kernel
    dim3 bones_threads1(512);
    dim3 bones_grid1(128);
    bones_kernel__0<<< bones_grid1, bones_threads1 >>>(,,bones_device_temp,);  // missing slots: size/input arguments
    // Start the second kernel
    dim3 bones_threads2(128);
    dim3 bones_grid2(1);
    bones_kernel__1<<< bones_grid2, bones_threads2 >>>(bones_device_temp,,);   // missing slots: output arguments
    cudaFree(bones_device_temp);
  }

  // Perform the last computation (only needed if there is an initial value)
  dim3 bones_threads3(1);
  dim3 bones_grid3(1);
  bones_kernel__2<<< bones_grid3, bones_threads3 >>>(bones_initial_value,,);   // missing slots: output arguments
  cudaFree(bones_initial_value);
}