Sha256: 6d0f5ac68b4d93efb40f754dbe660250eda0fc7105dadbb1777f99ba4b8b18aa
Contents?: true
Size: 1.67 KB
Versions: 1
Compression:
Stored size: 1.67 KB
Contents
static inline uint8x16x4_t
enc_reshuffle (const uint8x16x3_t in)
{
    uint8x16x4_t out;

#if defined(__GNUC__) || defined(__clang__)
    // GCC and Clang support the following inline assembly syntax. This
    // inline assembly implements the exact same algorithm as the
    // intrinsics further down, but benchmarks show that the inline
    // assembly easily beats the intrinsics. Perhaps this is because the
    // inline assembly is well pipelined to avoid data dependencies.
    __asm__ (
        "ushr %[o0].16b, %[i0].16b, #2   \n\t"
        "ushr %[o1].16b, %[i1].16b, #2   \n\t"
        "ushr %[o2].16b, %[i2].16b, #4   \n\t"
        "sli  %[o1].16b, %[i0].16b, #6   \n\t"
        "sli  %[o2].16b, %[i1].16b, #4   \n\t"
        "shl  %[o3].16b, %[i2].16b, #2   \n\t"
        "ushr %[o1].16b, %[o1].16b, #2   \n\t"
        "ushr %[o2].16b, %[o2].16b, #2   \n\t"
        "ushr %[o3].16b, %[o3].16b, #2   \n\t"

        // Outputs:
        : [o0] "=&w" (out.val[0]),
          [o1] "=&w" (out.val[1]),
          [o2] "=&w" (out.val[2]),
          [o3] "=&w" (out.val[3])

        // Inputs:
        : [i0] "w" (in.val[0]),
          [i1] "w" (in.val[1]),
          [i2] "w" (in.val[2])
    );
#else
    // Divide bits of three input bytes over four output bytes. All output
    // bytes except the first one are shifted over two bits to the left:
    out.val[0] = vshrq_n_u8(in.val[0], 2);
    out.val[1] = vshrq_n_u8(in.val[1], 2);
    out.val[2] = vshrq_n_u8(in.val[2], 4);
    out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
    out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
    out.val[3] = vshlq_n_u8(in.val[2], 2);

    // Clear the top two bits by shifting the output back to the right:
    out.val[1] = vshrq_n_u8(out.val[1], 2);
    out.val[2] = vshrq_n_u8(out.val[2], 2);
    out.val[3] = vshrq_n_u8(out.val[3], 2);
#endif

    return out;
}
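For reference, the bit layout that this routine computes in each vector lane can be written as a plain scalar function: three input bytes are split into four 6-bit values, each stored in the low bits of an output byte. The sketch below is not part of the vendored file; scalar_reshuffle and the test values are illustrative assumptions, shown only to make the bit arithmetic explicit.

    #include <assert.h>
    #include <stdint.h>

    // Scalar equivalent of one lane of enc_reshuffle: 3 bytes in, 4 sextets out.
    static void scalar_reshuffle(const uint8_t in[3], uint8_t out[4])
    {
        out[0] = in[0] >> 2;                            // top 6 bits of byte 0
        out[1] = ((in[0] & 0x03) << 4) | (in[1] >> 4);  // low 2 of byte 0, top 4 of byte 1
        out[2] = ((in[1] & 0x0F) << 2) | (in[2] >> 6);  // low 4 of byte 1, top 2 of byte 2
        out[3] = in[2] & 0x3F;                          // low 6 bits of byte 2
    }

    int main(void)
    {
        // "Man" encodes to "TWFu", i.e. 6-bit alphabet indices 19, 22, 5, 46.
        const uint8_t in[3] = { 'M', 'a', 'n' };
        uint8_t out[4];
        scalar_reshuffle(in, out);
        assert(out[0] == 19 && out[1] == 22 && out[2] == 5 && out[3] == 46);
        return 0;
    }

The NEON intrinsics reach the same result differently: they first build each output byte with two junk bits in the high positions (via the shift-left-insert), then shift right by two to clear them, which keeps every step a single vector instruction.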
Version data entries
1 entry across 1 version & 1 rubygem
Version | Path |
---|---|
ob64-0.5.0 | vendor/libbase64/lib/arch/neon64/enc_reshuffle.c |