/** * The MurmurHash3 algorithm was created by Austin Appleby and placed in the public domain. * This java port was authored by Yonik Seeley and also placed into the public domain. * The author hereby disclaims copyright to this source code. *

* This produces exactly the same hash values as the final C++ * version of MurmurHash3 and is thus suitable for producing the same hash values across * platforms. *

* The 32 bit x86 version of this hash should be the fastest variant for relatively short keys like ids. * murmurhash3_x64_128 is a good choice for longer strings or if you need more than 32 bits of hash. *

* Note - The x86 and x64 versions do _not_ produce the same results, as the * algorithms are optimized for their respective platforms. *

* See http://github.com/yonik/java_util for future updates to this file. */ public final class MurmurHash3 { /** * 128 bits of state */ public static final class LongPair { public long val1; public long val2; } public static final int fmix32(int h) { h ^= h >>> 16; h *= 0x85ebca6b; h ^= h >>> 13; h *= 0xc2b2ae35; h ^= h >>> 16; return h; } public static final long fmix64(long k) { k ^= k >>> 33; k *= 0xff51afd7ed558ccdL; k ^= k >>> 33; k *= 0xc4ceb9fe1a85ec53L; k ^= k >>> 33; return k; } /** * Gets a long from a byte buffer in little endian byte order. */ public static final long getLongLittleEndian(byte[] buf, int offset) { return ((long) buf[offset + 7] << 56) // no mask needed | ((buf[offset + 6] & 0xffL) << 48) | ((buf[offset + 5] & 0xffL) << 40) | ((buf[offset + 4] & 0xffL) << 32) | ((buf[offset + 3] & 0xffL) << 24) | ((buf[offset + 2] & 0xffL) << 16) | ((buf[offset + 1] & 0xffL) << 8) | ((buf[offset] & 0xffL)); // no shift needed } /** * Returns the MurmurHash3_x86_32 hash of the UTF-8 bytes of the String without actually encoding * the string to a temporary buffer. This is more than 2x faster than hashing the result * of String.getBytes(). */ public static long murmurhash3_x86_32(CharSequence data, int seed) { final int c1 = 0xcc9e2d51; final int c2 = 0x1b873593; int h1 = seed; int offset = 0; int len = data.length(); int pos = offset; int end = offset + len; int k1 = 0; int k2 = 0; int shift = 0; int bits = 0; int nBytes = 0; // length in UTF8 bytes while (pos < end) { int code = data.charAt(pos++); if (code < 0x80) { k2 = code; bits = 8; } else if (code < 0x800) { k2 = (0xC0 | (code >> 6)) | ((0x80 | (code & 0x3F)) << 8); bits = 16; } else if (code < 0xD800 || code > 0xDFFF || pos >= end) { // we check for pos>=end to encode an unpaired surrogate as 3 bytes. k2 = (0xE0 | (code >> 12)) | ((0x80 | ((code >> 6) & 0x3F)) << 8) | ((0x80 | (code & 0x3F)) << 16); bits = 24; } else { // surrogate pair // int utf32 = pos < end ? (int) data.charAt(pos++) : 0; int utf32 = (int) data.charAt(pos++); utf32 = ((code - 0xD7C0) << 10) + (utf32 & 0x3FF); k2 = (0xff & (0xF0 | (utf32 >> 18))) | ((0x80 | ((utf32 >> 12) & 0x3F))) << 8 | ((0x80 | ((utf32 >> 6) & 0x3F))) << 16 | (0x80 | (utf32 & 0x3F)) << 24; bits = 32; } k1 |= k2 << shift; // int used_bits = 32 - shift; // how many bits of k2 were used in k1. // int unused_bits = bits - used_bits; // (bits-(32-shift)) == bits+shift-32 == bits-newshift shift += bits; if (shift >= 32) { // mix after we have a complete word k1 *= c1; k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); k1 *= c2; h1 ^= k1; h1 = (h1 << 13) | (h1 >>> 19); // ROTL32(h1,13); h1 = h1 * 5 + 0xe6546b64; shift -= 32; // unfortunately, java won't let you shift 32 bits off, so we need to check for 0 if (shift != 0) { k1 = k2 >>> (bits - shift); // bits used == bits - newshift } else { k1 = 0; } nBytes += 4; } } // inner // handle tail if (shift > 0) { nBytes += shift >> 3; k1 *= c1; k1 = (k1 << 15) | (k1 >>> 17); // ROTL32(k1,15); k1 *= c2; h1 ^= k1; } // finalization h1 ^= nBytes; // fmix(h1); h1 ^= h1 >>> 16; h1 *= 0x85ebca6b; h1 ^= h1 >>> 13; h1 *= 0xc2b2ae35; h1 ^= h1 >>> 16; return h1 & 0xFFFFFFFFL; } // The following set of methods and constants are borrowed from: // `This method is borrowed from `org.apache.commons.codec.digest.MurmurHash3` // Constants for 128-bit variant private static final long C1 = 0x87c37b91114253d5L; private static final long C2 = 0x4cf5ad432745937fL; private static final int R1 = 31; private static final int R2 = 27; private static final int R3 = 33; private static final int M = 5; private static final int N1 = 0x52dce729; private static final int N2 = 0x38495ab5; /** * Gets the little-endian long from 8 bytes starting at the specified index. * * @param data The data * @param index The index * @return The little-endian long */ private static long getLittleEndianLong(final byte[] data, final int index) { return (((long) data[index ] & 0xff) ) | (((long) data[index + 1] & 0xff) << 8) | (((long) data[index + 2] & 0xff) << 16) | (((long) data[index + 3] & 0xff) << 24) | (((long) data[index + 4] & 0xff) << 32) | (((long) data[index + 5] & 0xff) << 40) | (((long) data[index + 6] & 0xff) << 48) | (((long) data[index + 7] & 0xff) << 56); } public static long[] hash128x64(final byte[] data) { return hash128x64(data, 0, data.length, 0); } /** * Generates 128-bit hash from the byte array with the given offset, length and seed. * *

This is an implementation of the 128-bit hash function {@code MurmurHash3_x64_128} * from from Austin Applyby's original MurmurHash3 {@code c++} code in SMHasher.

* * @param data The input byte array * @param offset The first element of array * @param length The length of array * @param seed The initial seed value * @return The 128-bit hash (2 longs) */ public static long[] hash128x64(final byte[] data, final int offset, final int length, final long seed) { long h1 = seed; long h2 = seed; final int nblocks = length >> 4; // body for (int i = 0; i < nblocks; i++) { final int index = offset + (i << 4); long k1 = getLittleEndianLong(data, index); long k2 = getLittleEndianLong(data, index + 8); // mix functions for k1 k1 *= C1; k1 = Long.rotateLeft(k1, R1); k1 *= C2; h1 ^= k1; h1 = Long.rotateLeft(h1, R2); h1 += h2; h1 = h1 * M + N1; // mix functions for k2 k2 *= C2; k2 = Long.rotateLeft(k2, R3); k2 *= C1; h2 ^= k2; h2 = Long.rotateLeft(h2, R1); h2 += h1; h2 = h2 * M + N2; } // tail long k1 = 0; long k2 = 0; final int index = offset + (nblocks << 4); switch (offset + length - index) { case 15: k2 ^= ((long) data[index + 14] & 0xff) << 48; case 14: k2 ^= ((long) data[index + 13] & 0xff) << 40; case 13: k2 ^= ((long) data[index + 12] & 0xff) << 32; case 12: k2 ^= ((long) data[index + 11] & 0xff) << 24; case 11: k2 ^= ((long) data[index + 10] & 0xff) << 16; case 10: k2 ^= ((long) data[index + 9] & 0xff) << 8; case 9: k2 ^= data[index + 8] & 0xff; k2 *= C2; k2 = Long.rotateLeft(k2, R3); k2 *= C1; h2 ^= k2; case 8: k1 ^= ((long) data[index + 7] & 0xff) << 56; case 7: k1 ^= ((long) data[index + 6] & 0xff) << 48; case 6: k1 ^= ((long) data[index + 5] & 0xff) << 40; case 5: k1 ^= ((long) data[index + 4] & 0xff) << 32; case 4: k1 ^= ((long) data[index + 3] & 0xff) << 24; case 3: k1 ^= ((long) data[index + 2] & 0xff) << 16; case 2: k1 ^= ((long) data[index + 1] & 0xff) << 8; case 1: k1 ^= data[index] & 0xff; k1 *= C1; k1 = Long.rotateLeft(k1, R1); k1 *= C2; h1 ^= k1; } // finalization h1 ^= length; h2 ^= length; h1 += h2; h2 += h1; h1 = fmix64(h1); h2 = fmix64(h2); h1 += h2; h2 += h1; return new long[] { h1, h2 }; } }