// Three OpenCL kernels that together form a multi-stage parallel reduction.
//
// NOTE(review): this text looks like a kernel template whose placeholder tokens
// were stripped and whose line breaks were collapsed -- see the empty slots in
// "__global , __global , )", "int bones_iter_id = ;" and "for(int c=1; c;".
// As laid out here, the leading "//" on a physical line also comments out all
// code that follows it on that line. Presumably the element types, buffer
// names, index expressions and combining expressions must be restored from the
// original template before this can be built -- TODO confirm against the
// generator that produced this file.
//
// DIV_CEIL and BONES_MIN are not defined in this view.
//
// Visible structure:
//  * bones_kernel__0 (main): derives bones_threadblock_work,
//    bones_parallel_work and bones_sequential_work from bones_input_size, the
//    number of work-groups and the local size. Each work-item loads into
//    bones_private_memory (first sequential stage), stores that value in
//    bones_local_memory[256] at its local id, and then takes part in a loop
//    with c = 256, 128, ..., 2 in which work-items below c/2 address a partner
//    slot at local id + c/2 (only if that slot is below bones_parallel_work).
//    A CLK_LOCAL_MEM_FENCE barrier follows every step. Work-item 0 writes
//    bones_local_memory[0] to index get_group_id(0) of the output buffer.
//  * bones_kernel__1 (secondary): same halving loop, starting at c = 128 over
//    bones_local_memory[128]. Both bones_local_id and bones_global_id are taken
//    from get_local_id(0); presumably this kernel is meant to run as a single
//    work-group -- verify against the host code. Work-item 0 writes
//    bones_local_memory[0] to index 0 of the output buffer.
//  * bones_kernel__2 (final): reads element [0] of a buffer and
//    bones_initial_value[0] into private variables and writes a value back to
//    element [0]; the expression that combines them is missing here.
// Start of the kernel (main, not unrolled kernel) __kernel void bones_kernel__0(int bones_input_size, __global , __global , ) { const int bones_threadblock_work = DIV_CEIL(bones_input_size,get_num_groups(0)); const int bones_parallel_work = BONES_MIN(get_local_size(0),bones_threadblock_work); const int bones_sequential_work = DIV_CEIL(bones_threadblock_work,bones_parallel_work); const int bones_local_id = get_local_id(0); const int bones_global_id = get_global_id(0); int bones_iter_id = ; // Load data into thread private memory and perform the first computation(s) sequentially bones_temporary = [bones_iter_id]; bones_private_memory = ; for(int c=1; c; if (bones_iter_id <= ) { bones_temporary = [bones_iter_id]; bones_private_memory = ; } } // Initialize the local memory volatile __local bones_local_memory[256]; bones_local_memory[bones_local_id] = bones_private_memory; barrier(CLK_LOCAL_MEM_FENCE); // Perform the remainder of the computations in parallel using a parallel reduction tree int bones_offset_id; for (int c=256; c>=2; c=c>>1) { if ((2*bones_parallel_work > c) && (get_local_id(0) < c/2)) { bones_offset_id = get_local_id(0)+c/2; if (bones_offset_id < bones_parallel_work) { bones_local_memory[bones_local_id] = ; } } barrier(CLK_LOCAL_MEM_FENCE); } // Write the final result back to the global memory if (get_local_id(0) == 0) { [get_group_id(0)] = bones_local_memory[0]; } } // Start of the kernel (secondary, not unrolled kernel) __kernel void bones_kernel__1(__global , __global ) { const int bones_local_id = get_local_id(0); const int bones_global_id = get_local_id(0); // Initialize the local memory volatile __local bones_local_memory[128]; bones_local_memory[bones_local_id] = [bones_global_id]; barrier(CLK_LOCAL_MEM_FENCE); // Perform reduction using a parallel reduction tree int bones_offset_id; for (int c=128; c>=2; c=c>>1) { if (get_local_id(0) < c/2) { bones_offset_id = get_local_id(0)+c/2; bones_local_memory[bones_local_id] = ; } 
barrier(CLK_LOCAL_MEM_FENCE); } // Write the final result back to the global memory if (get_local_id(0) == 0) { [0] = bones_local_memory[0]; } } // Start of the kernel (final, initial value kernel) __kernel void bones_kernel__2(__global bones_initial_value, __global ) { bones_private_memory = [0]; bones_temporary = bones_initial_value[0]; [0] = ; }