% ctype = dtype_to_c_type(data_type)
/*
 * pack -- ERB template that renders an OpenCL C kernel.
 *
 * Each work-item copies one element, A[globalCol], into C at an offset
 * derived from coordinates that are computed at run time using constants
 * baked in when the template is rendered.
 *
 * Template inputs (supplied by the rendering context, not visible here):
 *   data_type   - mapped to the C element type through dtype_to_c_type
 *   divisors    - one divisor per coordinate, used to split the flat offset
 *   multipliers - one multiplier per coordinate, used to build the C offset
 *   axis        - when greater than 0, coordinates 0..axis are rotated left
 *                 by one position before the destination offset is built
 *
 * Kernel arguments:
 *   N     - not referenced by the kernel body
 *   index - scaled by divisors[0] to form the base flat offset
 *   A     - source buffer, read at globalCol
 *   C     - destination buffer
 */
__kernel void pack(const int N, const int index, __global const <%= ctype %> *A, __global <%= ctype %> *C) {
    // One work-item per source element; globalCol indexes A directly.
    const int globalCol = get_global_id(0);

    // Flat offset that is decomposed into coordinates below.
    int ptr = index * <%= divisors[0] %> + globalCol;
    int index_map[<%= divisors.size %>] = { <%= Array.new(divisors.size) { 0 }.join(', ') %> };

    // Compute effective coordinates. Integer division is used on purpose:
    // going through float loses exactness once ptr exceeds 2^24, which
    // silently yields wrong coordinates. For non-negative ptr (expected when
    // index is non-negative) truncating division equals the floor.
<% divisors.each_with_index do |div, dim| %>
    index_map[<%= dim %>] = ptr / <%= div %>;
<% if dim < divisors.size - 1 %>
    ptr = ptr % <%= div %>;
<% end %>
<% end %>

    // Apply axis translation if needed: shift coordinates 1..axis down by
    // one slot and move the original coordinate 0 into slot `axis`.
<% if axis > 0 %>
    int first = index_map[0];
<% axis.times do |i| %>
    index_map[<%= i %>] = index_map[<%= i + 1 %>];
<% end %>
    index_map[<%= axis %>] = first;
<% end %>

    // Destination offset is the multiplier-weighted sum of the coordinates.
    C[<%= multipliers.each_with_index.map { |m, idx| "#{m}*index_map[#{idx}]" }.join(' + ') %>] = A[globalCol];
}