/* STARTDEF void bones_prekernel__0(, ); ENDDEF */ #define SHUFFLE_X 16 #define SHUFFLE_Y 16 // Start of the kernel __global__ void bones_kernel__0(, ) { const int bones_global_id = blockIdx.x*blockDim.x + threadIdx.x; if (bones_global_id < ()) { // Calculate the global ID(s) based on the thread id // Start the computation } } // Start of the kernel (pre-kernel for shuffling) - for first input __global__ void bones_kernel__1( , shuffled_, ) { const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x; const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y; // Set-up the local memory for shuffling __shared__ buffer[SHUFFLE_X][SHUFFLE_Y]; // Swap the x and y coordinates to perform the rotation (coalesced) if (bones_global_id_0 < (()/()) && bones_global_id_1 < ()) { buffer[threadIdx.y][threadIdx.x] = [bones_global_id_0 + bones_global_id_1 * (()/())]; } // Synchronize all threads in the threadblock __syncthreads(); // We don't have to swap the x and y thread indices here, because that's already done in the local memory const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x; const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y; // Store the shuffled result (coalesced) if (bones_global_id_0_new < (()/()) && bones_global_id_1_new < ()) { shuffled_[bones_global_id_0_new + bones_global_id_1_new * ] = buffer[threadIdx.x][threadIdx.y]; } } // Start of the kernel (pre-kernel for shuffling) - for second input __global__ void bones_kernel__2( , shuffled_, ) { const int bones_global_id_0 = blockIdx.x*blockDim.x + threadIdx.x; const int bones_global_id_1 = blockIdx.y*blockDim.y + threadIdx.y; // Set-up the local memory for shuffling __shared__ buffer[SHUFFLE_X][SHUFFLE_Y]; // Swap the x and y coordinates to perform the rotation (coalesced) if (bones_global_id_0 < (()/()) && bones_global_id_1 < ()) { buffer[threadIdx.y][threadIdx.x] = [bones_global_id_0 + bones_global_id_1 * (()/())]; } // Synchronize all threads in the threadblock __syncthreads(); // We don't have to swap the x and y thread indices here, because that's already done in the local memory const int bones_global_id_0_new = blockIdx.y*blockDim.y + threadIdx.x; const int bones_global_id_1_new = blockIdx.x*blockDim.x + threadIdx.y; // Store the shuffled result (coalesced) if (bones_global_id_0_new < (()/()) && bones_global_id_1_new < ()) { shuffled_[bones_global_id_0_new + bones_global_id_1_new * ] = buffer[threadIdx.x][threadIdx.y]; } } // Function to start the kernel extern "C" void bones_prekernel__0(, ) { int bones_block_size; if ( >= 64*512) { bones_block_size = 512;} else if ( >= 64*256) { bones_block_size = 256;} else if ( >= 64*128) { bones_block_size = 128;} else if ( >= 64*64 ) { bones_block_size = 64; } else { bones_block_size = 32; } // First perform some pre-shuffling (for the first input) * shuffled_ = 0; cudaMalloc((void**)&shuffled_, *sizeof()); dim3 bones_threads1(SHUFFLE_X,SHUFFLE_Y); dim3 bones_grid1(DIV_CEIL((()/()),SHUFFLE_X),DIV_CEIL(,SHUFFLE_Y)); bones_kernel__1<<< bones_grid1, bones_threads1 >>>(, shuffled_, ); * temp_ = ; = shuffled_; cudaFree(temp_); // First perform some pre-shuffling (for the second input) * shuffled_ = 0; cudaMalloc((void**)&shuffled_, *sizeof()); dim3 bones_threads2(SHUFFLE_X,SHUFFLE_Y); dim3 bones_grid2(DIV_CEIL((()/()),SHUFFLE_X),DIV_CEIL(,SHUFFLE_Y)); bones_kernel__2<<< bones_grid2, bones_threads2 >>>(, shuffled_, ); * temp_ = ; = shuffled_; cudaFree(temp_); // Then run the original kernel dim3 bones_threads0(bones_block_size); dim3 bones_grid0(DIV_CEIL(,bones_block_size)); bones_kernel__0<<< bones_grid0, bones_threads0 >>>(, ); }