HADOOP-14313. Replace/improve Hadoop's byte[] comparator. Contributed by Vikas Vishwakarma.

2018-06-28 14:58:40 +09:00 · 2018-06-28 14:58:40 +09:00 · ddbff7c8d3
parent 2b2399d623
commit ddbff7c8d3
2 changed files with 25 additions and 27 deletions
--- a/NOTICE.txt
+++ b/NOTICE.txt
@ -196,6 +196,14 @@ by Google Inc, which can be obtained at:
  * HOMEPAGE:
    * http://code.google.com/p/snappy/

+This product contains a modified portion of UnsignedBytes LexicographicalComparator
+from Guava v21 project by Google Inc, which can be obtained at:
+
+  * LICENSE:
+    * license/COPYING (Apache License 2.0)
+  * HOMEPAGE:
+    * https://github.com/google/guava
+
 This product optionally depends on 'JBoss Marshalling', an alternative Java
 serialization API, which can be obtained at:

--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/io/FastByteComparisons.java
@ -26,7 +26,6 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import sun.misc.Unsafe;

-import com.google.common.primitives.Longs;
 import com.google.common.primitives.UnsignedBytes;

 /**
@ -195,52 +194,43 @@ abstract class FastByteComparisons {
            length1 == length2) {
          return 0;
        }
+        final int stride = 8;
        int minLength = Math.min(length1, length2);
-        int minWords = minLength / Longs.BYTES;
+        int strideLimit = minLength & ~(stride - 1);
        int offset1Adj = offset1 + BYTE_ARRAY_BASE_OFFSET;
        int offset2Adj = offset2 + BYTE_ARRAY_BASE_OFFSET;
+        int i;

        /*
         * Compare 8 bytes at a time. Benchmarking shows comparing 8 bytes at a
         * time is no slower than comparing 4 bytes at a time even on 32-bit.
         * On the other hand, it is substantially faster on 64-bit.
         */
-        for (int i = 0; i < minWords * Longs.BYTES; i += Longs.BYTES) {
+        for (i = 0; i < strideLimit; i += stride) {
          long lw = theUnsafe.getLong(buffer1, offset1Adj + (long) i);
          long rw = theUnsafe.getLong(buffer2, offset2Adj + (long) i);
-          long diff = lw ^ rw;

-          if (diff != 0) {
+          if (lw != rw) {
            if (!littleEndian) {
              return lessThanUnsigned(lw, rw) ? -1 : 1;
            }

-            // Use binary search
-            int n = 0;
-            int y;
-            int x = (int) diff;
-            if (x == 0) {
-              x = (int) (diff >>> 32);
-              n = 32;
-            }
-
-            y = x << 16;
-            if (y == 0) {
-              n += 16;
-            } else {
-              x = y;
-            }
-
-            y = x << 8;
-            if (y == 0) {
-              n += 8;
-            }
-            return (int) (((lw >>> n) & 0xFFL) - ((rw >>> n) & 0xFFL));
+            /*
+             * We want to compare only the first index where left[index] !=
+             * right[index]. This corresponds to the least significant nonzero
+             * byte in lw ^ rw, since lw and rw are little-endian.
+             * Long.numberOfTrailingZeros(diff) tells us the least significant
+             * nonzero bit, and zeroing out the first three bits of L.nTZ gives
+             * us the shift to get that least significant nonzero byte. This
+             * comparison logic is based on UnsignedBytes from Guava v21
+             */
+            int n = Long.numberOfTrailingZeros(lw ^ rw) & ~0x7;
+            return ((int) ((lw >>> n) & 0xFF)) - ((int) ((rw >>> n) & 0xFF));
          }
        }

        // The epilogue to cover the last (minLength % 8) elements.
-        for (int i = minWords * Longs.BYTES; i < minLength; i++) {
+        for (; i < minLength; i++) {
          int result = UnsignedBytes.compare(
              buffer1[offset1 + i],
              buffer2[offset2 + i]);