ext/numo/narray/gen/tmpl/unary_s.c in numo-narray-0.9.1.4 vs ext/numo/narray/gen/tmpl/unary_s.c in numo-narray-0.9.1.5

- old
+ new

@@ -42,47 +42,71 @@ GET_DATA_STRIDE(p1,s1,dtype,x); x = m_<%=name%>(x); SET_DATA_INDEX(p2,idx2,dtype,x); } } else { -<% if is_simd and !is_complex and %w[sqrt].include? name %> - // Check number of elements. & Check same alignment. - if ((n >= num_pack) && is_same_aligned2(&((dtype*)p1)[i], &((dtype*)p2)[i], SIMD_ALIGNMENT_SIZE)){ - // Calculate up to the position just before the start of SIMD computation. - cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype)); - for (i=0; i < cnt; i++) { - ((dtype*)p2)[i] = m_<%=name%>(((dtype*)p1)[i]); - } + //<% if need_align %> + if (is_aligned(p1,sizeof(dtype)) && + is_aligned(p2,sizeof(dtype)) ) { + if (s1 == sizeof(dtype) && + s2 == sizeof(dtype) ) { + //<% if is_simd and !is_complex and %w[sqrt].include? name %> + // Check number of elements. & Check same alignment. + if ((n >= num_pack) && is_same_aligned2(&((dtype*)p1)[i], &((dtype*)p2)[i], SIMD_ALIGNMENT_SIZE)){ + // Calculate up to the position just before the start of SIMD computation. + cnt = get_count_of_elements_not_aligned_to_simd_size(&((dtype*)p1)[i], SIMD_ALIGNMENT_SIZE, sizeof(dtype)); + for (i=0; i < cnt; i++) { + ((dtype*)p2)[i] = m_<%=name%>(((dtype*)p1)[i]); + } - // Get the count of SIMD computation loops. - cnt_simd_loop = (n - i) % num_pack; + // Get the count of SIMD computation loops. + cnt_simd_loop = (n - i) % num_pack; - // SIMD computation. - if (p1 == p2) { // inplace case - for(; i < n - cnt_simd_loop; i += num_pack){ - a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]); - a = _mm_<%=name%>_<%=simd_type%>(a); - _mm_store_<%=simd_type%>(&((dtype*)p1)[i], a); + // SIMD computation. + if (p1 == p2) { // inplace case + for(; i < n - cnt_simd_loop; i += num_pack){ + a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]); + a = _mm_<%=name%>_<%=simd_type%>(a); + _mm_store_<%=simd_type%>(&((dtype*)p1)[i], a); + } + } else { + for(; i < n - cnt_simd_loop; i += num_pack){ + a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]); + a = _mm_<%=name%>_<%=simd_type%>(a); + _mm_stream_<%=simd_type%>(&((dtype*)p2)[i], a); + } + } + } - } else { - for(; i < n - cnt_simd_loop; i += num_pack){ - a = _mm_load_<%=simd_type%>(&((dtype*)p1)[i]); - a = _mm_<%=name%>_<%=simd_type%>(a); - _mm_stream_<%=simd_type%>(&((dtype*)p2)[i], a); + // Compute the remainder of the SIMD operation. + if (cnt_simd_loop != 0){ + //<% end %> + for (; i<n; i++) { + ((dtype*)p2)[i] = m_<%=name%>(((dtype*)p1)[i]); + } + //<% if is_simd and !is_complex and %w[sqrt].include? name %> } + //<% end %> + return; } - - } - // Compute the remainder of the SIMD operation. - if (cnt_simd_loop != 0){ -<% end %> - for (; i<n; i++) { - ((dtype*)p2)[i] = m_<%=name%>(((dtype*)p1)[i]); + if (is_aligned_step(s1,sizeof(dtype)) && + is_aligned_step(s2,sizeof(dtype)) ) { + //<% end %> + for (i=0; i<n; i++) { + *(dtype*)p2 = m_<%=name%>(*(dtype*)p1); + p1 += s1; + p2 += s2; + } + return; + //<% if need_align %> } -<% if is_simd and !is_complex and %w[sqrt].include? name %> } -<% end %> - return; + for (i=0; i<n; i++) { + GET_DATA_STRIDE(p1,s1,dtype,x); + x = m_<%=name%>(x); + SET_DATA_STRIDE(p2,s2,dtype,x); + } + //<% end %> } } } /*