HBASE-1183 New MR splitting algorithm and other new features need a way to split a key range in N chunks

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@769076 13f79535-47bb-0310-9956-ffa450edef68
2009-04-27 18:05:34 +00:00 · 2009-04-27 18:05:34 +00:00 · 4a05b0693a
parent b5da244412
commit 4a05b0693a
3 changed files with 137 additions and 0 deletions
--- a/CHANGES.txt
+++ b/CHANGES.txt
@ -167,6 +167,8 @@ Release 0.20.0 - Unreleased
               (Evgeny Ryabitskiy via Stack)
   HBASE-1260  Bytes utility class changes: remove usage of ByteBuffer and
               provide additional ByteBuffer primitives (Jon Gray via Stack)
+   HBASE-1183  New MR splitting algorithm and other new features need a way to
+               split a key range in N chunks (Jon Gray via Stack)

 Release 0.19.0 - 01/21/2009
  INCOMPATIBLE CHANGES
--- a/src/java/org/apache/hadoop/hbase/util/Bytes.java
+++ b/src/java/org/apache/hadoop/hbase/util/Bytes.java
@ -25,6 +25,7 @@ import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
 import java.util.Comparator;
+import java.math.BigInteger;

 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
@ -749,7 +750,107 @@ public class Bytes {
    return result;
  }
  
+  /**
+   * @param a
+   * @param length
+   * @return First <code>length</code> bytes from <code>a</code>
+   */
+  public static byte [] head(final byte [] a, final int length) {
+    if(a.length < length) return null;
+    byte [] result = new byte[length];
+    System.arraycopy(a, 0, result, 0, length);
+    return result;
+  }

+  /**
+   * @param a
+   * @param length
+   * @return Last <code>length</code> bytes from <code>a</code>
+   */
+  public static byte [] tail(final byte [] a, final int length) {
+    if(a.length < length) return null;
+    byte [] result = new byte[length];
+    System.arraycopy(a, a.length - length, result, 0, length);
+    return result;
+  }
+
+  /**
+   * @param a
+   * @param length
+   * @return Value in <code>a</code> plus <code>length</code> prepended 0 bytes
+   */
+  public static byte [] padHead(final byte [] a, final int length) {
+    byte [] padding = new byte[length];
+    for(int i=0;i<length;i++) padding[i] = 0;
+    return add(padding,a);
+  }
+  
+  /**
+   * @param a
+   * @param length
+   * @return Value in <code>a</code> plus <code>length</code> appended 0 bytes
+   */
+  public static byte [] padTail(final byte [] a, final int length) {
+    byte [] padding = new byte[length];
+    for(int i=0;i<length;i++) padding[i] = 0;
+    return add(a,padding);
+  }
+  
+  /**
+   * Split passed range.  Expensive operation relatively.  Uses BigInteger math.
+   * Useful splitting ranges for MapReduce jobs.
+   * @param a Beginning of range
+   * @param b End of range
+   * @param num Number of times to split range.  Pass 1 if you want to split
+   * the range in two; i.e. one split.
+   * @return Array of dividing values
+   */
+  public static byte [][] split(final byte [] a, final byte [] b, final int num) {
+    byte [] aPadded = null;
+    byte [] bPadded = null;
+    if (a.length < b.length) {
+      aPadded = padTail(a,b.length-a.length);
+      bPadded = b;
+    } else if (b.length < a.length) {
+      aPadded = a;
+      bPadded = padTail(b,a.length-b.length);
+    } else {
+      aPadded = a;
+      bPadded = b;
+    }
+    if (compareTo(aPadded,bPadded) > 1) {
+      throw new IllegalArgumentException("b > a");
+    }
+    if (num <= 0) throw new IllegalArgumentException("num cannot be < 0");
+    byte [] prependHeader = {1, 0};
+    BigInteger startBI = new BigInteger(add(prependHeader, aPadded));
+    BigInteger stopBI = new BigInteger(add(prependHeader, bPadded));
+    BigInteger diffBI = stopBI.subtract(startBI);
+    BigInteger splitsBI = BigInteger.valueOf(num + 1);
+    if(diffBI.compareTo(splitsBI) <= 0) return null;
+    BigInteger intervalBI = null;
+    try {
+      intervalBI = diffBI.divide(splitsBI);
+    } catch(Exception e) {
+      return null;
+    }
+
+    byte [][] result = new byte[num+2][];
+    result[0] = a;
+
+    for (int i = 1; i <= num; i++) {
+      BigInteger curBI = startBI.add(intervalBI.multiply(BigInteger.valueOf(i)));
+      byte [] padded = curBI.toByteArray();
+      if (padded[1] == 0)
+        padded = tail(padded,padded.length-2);
+      else
+        padded = tail(padded,padded.length-1);
+      result[i] = padded;
+    }
+    result[num+1] = b;
+    return result;
+  }
+  
  /**
   * @param t
   * @return Array of byte arrays made from passed array of Text
--- a/src/test/org/apache/hadoop/hbase/util/TestBytes.java
+++ b/src/test/org/apache/hadoop/hbase/util/TestBytes.java
@ -24,6 +24,40 @@ import java.util.Arrays;
 import junit.framework.TestCase;

 public class TestBytes extends TestCase {
+  public void testSplit() throws Exception {
+    byte [] lowest = Bytes.toBytes("AAA");
+    byte [] middle = Bytes.toBytes("CCC");
+    byte [] highest = Bytes.toBytes("EEE");
+    byte [][] parts = Bytes.split(lowest, highest, 1);
+    for (int i = 0; i < parts.length; i++) {
+      System.out.println(Bytes.toString(parts[i]));
+    }
+    assertEquals(3, parts.length);
+    assertTrue(Bytes.equals(parts[1], middle));
+    // Now divide into three parts.  Change highest so split is even.
+    highest = Bytes.toBytes("DDD");
+    parts = Bytes.split(lowest, highest, 2);
+    for (int i = 0; i < parts.length; i++) {
+      System.out.println(Bytes.toString(parts[i]));
+    }
+    assertEquals(4, parts.length);
+    // Assert that 3rd part is 'CCC'.
+    assertTrue(Bytes.equals(parts[2], middle));
+  }
+
+  public void testSplit2() throws Exception {
+    // More split tests.
+    byte [] lowest = Bytes.toBytes("http://A");
+    byte [] highest = Bytes.toBytes("http://z");
+    byte [] middle = Bytes.toBytes("http://[");
+    byte [][] parts = Bytes.split(lowest, highest, 1);
+    for (int i = 0; i < parts.length; i++) {
+      System.out.println(Bytes.toString(parts[i]));
+    }
+    assertEquals(2, parts.length);
+    assertTrue(Bytes.equals(parts[1], middle));
+  }
+
  public void testToLong() throws Exception {
    long [] longs = {-1l, 123l, 122232323232l};
    for (int i = 0; i < longs.length; i++) {