% ctype = dtype_to_c_type(data_type)
/*
 * concat -- copy one input tensor into its slot of the concatenated output.
 *
 * Each work-item reads A[get_global_id(0)], breaks that flat index into one
 * coordinate per template-supplied divisor, adds `step` to the coordinate of
 * the concatenation axis, and stores the element into C at the offset formed
 * by weighting the coordinates with the template-supplied multipliers.
 *
 * Parameters:
 *   N     - not read by the kernel body (presumably the element count of A;
 *           TODO confirm against the host code). No bounds check is done, so
 *           the launch size must not exceed the number of elements in A.
 *   index - not read by the kernel body; kept for interface compatibility.
 *   step  - offset added to the coordinate along the concatenation axis.
 *   A     - input buffer, read at the work-item's flat index.
 *   C     - output buffer, written at the remapped offset.
 *
 * Template variables: `divisors`, `multipliers` and `axis` are supplied by the
 * renderer (presumably the input strides, the output strides and the axis
 * being concatenated along -- confirm against the caller).
 */
__kernel void concat(const int N, const int index, const int step, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
  // Flat index of the element of A handled by this work-item.
  const int globalCol = get_global_id(0);
  int ptr = globalCol;

  // Compute effective coordinates. Integer division is used deliberately:
  // going through single-precision float loses exactness once the flat index
  // exceeds 2^24 and would then map elements to the wrong output position.
<% divisors.each_with_index do |div, dim| %>
<%   if axis == dim %>
  int index_map_<%= dim %> = ptr / <%= div %> + step;
<%   else %>
  int index_map_<%= dim %> = ptr / <%= div %>;
<%   end %>
<%   if dim < divisors.size - 1 %>
  ptr = ptr % <%= div %>;
<%   end %>
<% end %>

  // Scatter the element to its remapped position in the output.
  C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map_#{idx}" }.join(' + ') %>] = A[globalCol];
}