Sha256: 6d0f5ac68b4d93efb40f754dbe660250eda0fc7105dadbb1777f99ba4b8b18aa

Contents?: true

Size: 1.67 KB

Versions: 1

Compression:

Stored size: 1.67 KB

Contents

static inline uint8x16x4_t
enc_reshuffle (const uint8x16x3_t in)
{
	uint8x16x4_t out;

#if defined(__GNUC__) || defined(__clang__)

	// GCC and Clang support the following inline assembly syntax. This
	// inline assembly implements the exact same algorithm as the
	// intrinsics further down, but benchmarks show that the inline
	// assembly easily beats the intrinsics. Perhaps this is because the
	// inline assembly is well pipelined to avoid data dependencies.

	__asm__ (
		"ushr %[o0].16b, %[i0].16b, #2    \n\t"
		"ushr %[o1].16b, %[i1].16b, #2    \n\t"
		"ushr %[o2].16b, %[i2].16b, #4    \n\t"
		"sli  %[o1].16b, %[i0].16b, #6    \n\t"
		"sli  %[o2].16b, %[i1].16b, #4    \n\t"
		"shl  %[o3].16b, %[i2].16b, #2    \n\t"

		"ushr %[o1].16b, %[o1].16b, #2    \n\t"
		"ushr %[o2].16b, %[o2].16b, #2    \n\t"
		"ushr %[o3].16b, %[o3].16b, #2    \n\t"

		// Outputs:
		: [o0] "=&w" (out.val[0]),
		  [o1] "=&w" (out.val[1]),
		  [o2] "=&w" (out.val[2]),
		  [o3] "=&w" (out.val[3])

		// Inputs:
		: [i0] "w" (in.val[0]),
		  [i1] "w" (in.val[1]),
		  [i2] "w" (in.val[2])
	);
#else
	// Divide bits of three input bytes over four output bytes. All output
	// bytes except the first one are shifted over two bits to the left:
	out.val[0] = vshrq_n_u8(in.val[0], 2);
	out.val[1] = vshrq_n_u8(in.val[1], 2);
	out.val[2] = vshrq_n_u8(in.val[2], 4);
	out.val[1] = vsliq_n_u8(out.val[1], in.val[0], 6);
	out.val[2] = vsliq_n_u8(out.val[2], in.val[1], 4);
	out.val[3] = vshlq_n_u8(in.val[2], 2);

	// Clear the top two bits by shifting the output back to the right:
	out.val[1] = vshrq_n_u8(out.val[1], 2);
	out.val[2] = vshrq_n_u8(out.val[2], 2);
	out.val[3] = vshrq_n_u8(out.val[3], 2);
#endif

	return out;
}
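
For readers unfamiliar with the reshuffle step, here is a minimal scalar sketch of the same bit split, shown on a single 3-byte group. It is not part of the ob64/libbase64 sources; the function name reshuffle_scalar and the small test driver are purely illustrative. The vendored NEON routine above performs this exact split on 16 byte lanes at once, turning three 16-byte input registers (48 bytes) into four 16-byte registers of 6-bit values (64 sextets) per call.

	// Hypothetical scalar reference (assumption: not part of ob64/libbase64).
	// Splits three input bytes into four 6-bit values, each in the low bits
	// of an output byte, mirroring the per-lane effect of enc_reshuffle().
	#include <stdint.h>
	#include <stdio.h>

	static void reshuffle_scalar(const uint8_t in[3], uint8_t out[4])
	{
		out[0] = in[0] >> 2;                            // top 6 bits of byte 0
		out[1] = ((in[0] & 0x03) << 4) | (in[1] >> 4);  // low 2 of byte 0, top 4 of byte 1
		out[2] = ((in[1] & 0x0F) << 2) | (in[2] >> 6);  // low 4 of byte 1, top 2 of byte 2
		out[3] = in[2] & 0x3F;                          // low 6 bits of byte 2
	}

	int main(void)
	{
		const uint8_t in[3] = { 'M', 'a', 'n' };        // classic base64 example
		uint8_t out[4];

		reshuffle_scalar(in, out);

		// Prints 19 22 5 46: the sextet indices that encode to "TWFu".
		printf("%u %u %u %u\n", out[0], out[1], out[2], out[3]);
		return 0;
	}

The output of enc_reshuffle() still has to be mapped through the base64 alphabet; that lookup happens in a separate translation step elsewhere in the library.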

Version data entries

1 entry across 1 version & 1 rubygem

Version Path
ob64-0.5.0 vendor/libbase64/lib/arch/neon64/enc_reshuffle.c