From 8c6ddc1aa5497a38018fdcf100bd33b385ca2c84 Mon Sep 17 00:00:00 2001 From: xiaowen147 Date: Fri, 20 Oct 2017 02:18:17 +0800 Subject: [PATCH] HBASE-19051 Add new split algorithm for num string Signed-off-by: tedyu Signed-off-by: Mike Drob --- .../hadoop/hbase/util/RegionSplitter.java | 80 +++++++++++++++---- .../hadoop/hbase/util/TestRegionSplitter.java | 74 +++++++++++++++-- 2 files changed, 131 insertions(+), 23 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java index 3ee593afca8..06bccd13abd 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/util/RegionSplitter.java @@ -274,6 +274,12 @@ public class RegionSplitter { *
  • bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -c 60 -f test:rs * myTable HexStringSplit * + *
  • create a table named 'myTable' with 50 pre-split regions, + * assuming the keys are decimal-encoded ASCII: + *
      + *
    • bin/hbase org.apache.hadoop.hbase.util.RegionSplitter -c 50 + * myTable DecimalStringSplit + *
    *
  • perform a rolling split of 'myTable' (i.e. 60 => 120 regions), # 2 * outstanding splits at a time, assuming keys are uniformly distributed * bytes: @@ -283,9 +289,9 @@ public class RegionSplitter { * * * - * There are two SplitAlgorithms built into RegionSplitter, HexStringSplit - * and UniformSplit. These are different strategies for choosing region - * boundaries. See their source code for details. + * There are three SplitAlgorithms built into RegionSplitter, HexStringSplit, + * DecimalStringSplit, and UniformSplit. These are different strategies for + * choosing region boundaries. See their source code for details. * * @param args * Usage: RegionSplitter <TABLE> <SPLITALGORITHM> @@ -353,9 +359,10 @@ public class RegionSplitter { if (2 != cmd.getArgList().size() || !oneOperOnly || cmd.hasOption("h")) { new HelpFormatter().printHelp("RegionSplitter \n"+ "SPLITALGORITHM is a java class name of a class implementing " + - "SplitAlgorithm, or one of the special strings HexStringSplit " + - "or UniformSplit, which are built-in split algorithms. " + + "SplitAlgorithm, or one of the special strings HexStringSplit or " + + "DecimalStringSplit or UniformSplit, which are built-in split algorithms. " + "HexStringSplit treats keys as hexadecimal ASCII, and " + + "DecimalStringSplit treats keys as decimal ASCII, and " + "UniformSplit treats keys as arbitrary bytes.", opt); return; } @@ -660,6 +667,8 @@ public class RegionSplitter { // their simple class name instead of a fully qualified class name. if(splitClassName.equals(HexStringSplit.class.getSimpleName())) { splitClass = HexStringSplit.class; + } else if (splitClassName.equals(DecimalStringSplit.class.getSimpleName())) { + splitClass = DecimalStringSplit.class; } else if (splitClassName.equals(UniformSplit.class.getSimpleName())) { splitClass = UniformSplit.class; } else { @@ -893,15 +902,52 @@ public class RegionSplitter { * Since this split algorithm uses hex strings as keys, it is easy to read & * write in the shell but takes up more space and may be non-intuitive. */ - public static class HexStringSplit implements SplitAlgorithm { + public static class HexStringSplit extends NumberStringSplit { final static String DEFAULT_MIN_HEX = "00000000"; final static String DEFAULT_MAX_HEX = "FFFFFFFF"; + final static int RADIX_HEX = 16; - String firstRow = DEFAULT_MIN_HEX; - BigInteger firstRowInt = BigInteger.ZERO; - String lastRow = DEFAULT_MAX_HEX; - BigInteger lastRowInt = new BigInteger(lastRow, 16); - int rowComparisonLength = lastRow.length(); + public HexStringSplit() { + super(DEFAULT_MIN_HEX, DEFAULT_MAX_HEX, RADIX_HEX); + } + + } + + /** + * The format of a DecimalStringSplit region boundary is the ASCII representation of + * reversed sequential number, or any other uniformly distributed decimal value. + * Row are decimal-encoded long values in the range + * "00000000" => "99999999" and are left-padded with zeros to keep the + * same order lexicographically as if they were binary. + */ + public static class DecimalStringSplit extends NumberStringSplit { + final static String DEFAULT_MIN_DEC = "00000000"; + final static String DEFAULT_MAX_DEC = "99999999"; + final static int RADIX_DEC = 10; + + public DecimalStringSplit() { + super(DEFAULT_MIN_DEC, DEFAULT_MAX_DEC, RADIX_DEC); + } + + } + + public abstract static class NumberStringSplit implements SplitAlgorithm { + + String firstRow; + BigInteger firstRowInt; + String lastRow; + BigInteger lastRowInt; + int rowComparisonLength; + int radix; + + NumberStringSplit(String minRow, String maxRow, int radix) { + this.firstRow = minRow; + this.lastRow = maxRow; + this.radix = radix; + this.firstRowInt = BigInteger.ZERO; + this.lastRowInt = new BigInteger(lastRow, this.radix); + this.rowComparisonLength = lastRow.length(); + } public byte[] split(byte[] start, byte[] end) { BigInteger s = convertToBigInteger(start); @@ -973,18 +1019,18 @@ public class RegionSplitter { public void setFirstRow(String userInput) { firstRow = userInput; - firstRowInt = new BigInteger(firstRow, 16); + firstRowInt = new BigInteger(firstRow, radix); } public void setLastRow(String userInput) { lastRow = userInput; - lastRowInt = new BigInteger(lastRow, 16); + lastRowInt = new BigInteger(lastRow, radix); // Precondition: lastRow > firstRow, so last's length is the greater rowComparisonLength = lastRow.length(); } public byte[] strToRow(String in) { - return convertToByte(new BigInteger(in, 16)); + return convertToByte(new BigInteger(in, radix)); } public String rowToStr(byte[] row) { @@ -1037,8 +1083,8 @@ public class RegionSplitter { * @param pad padding length * @return byte corresponding to input BigInteger */ - public static byte[] convertToByte(BigInteger bigInteger, int pad) { - String bigIntegerString = bigInteger.toString(16); + public byte[] convertToByte(BigInteger bigInteger, int pad) { + String bigIntegerString = bigInteger.toString(radix); bigIntegerString = StringUtils.leftPad(bigIntegerString, pad, '0'); return Bytes.toBytes(bigIntegerString); } @@ -1060,7 +1106,7 @@ public class RegionSplitter { * @return the corresponding BigInteger */ public BigInteger convertToBigInteger(byte[] row) { - return (row.length > 0) ? new BigInteger(Bytes.toString(row), 16) + return (row.length > 0) ? new BigInteger(Bytes.toString(row), radix) : BigInteger.ZERO; } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestRegionSplitter.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestRegionSplitter.java index aa42616832f..9643443471b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestRegionSplitter.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/TestRegionSplitter.java @@ -40,6 +40,7 @@ import org.apache.hadoop.hbase.client.Table; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.testclassification.MiscTests; import org.apache.hadoop.hbase.util.RegionSplitter.HexStringSplit; +import org.apache.hadoop.hbase.util.RegionSplitter.DecimalStringSplit; import org.apache.hadoop.hbase.util.RegionSplitter.SplitAlgorithm; import org.apache.hadoop.hbase.util.RegionSplitter.UniformSplit; import org.junit.AfterClass; @@ -143,7 +144,7 @@ public class TestRegionSplitter { byte[][] twoRegionsSplits = splitter.split(2); assertEquals(1, twoRegionsSplits.length); - assertArrayEquals(twoRegionsSplits[0], "80000000".getBytes()); + assertArrayEquals("80000000".getBytes(), twoRegionsSplits[0]); byte[][] threeRegionsSplits = splitter.split(3); assertEquals(2, threeRegionsSplits.length); @@ -163,21 +164,73 @@ public class TestRegionSplitter { // Halfway between 00... and 20... should be 10... splitPoint = splitter.split(firstRow, "20000000".getBytes()); - assertArrayEquals(splitPoint, "10000000".getBytes()); + assertArrayEquals("10000000".getBytes(), splitPoint); // Halfway between df... and ff... should be ef.... splitPoint = splitter.split("dfffffff".getBytes(), lastRow); - assertArrayEquals(splitPoint,"efffffff".getBytes()); + assertArrayEquals("efffffff".getBytes(), splitPoint); // Check splitting region with multiple mappers per region byte[][] splits = splitter.split("00000000".getBytes(), "30000000".getBytes(), 3, false); assertEquals(2, splits.length); - assertArrayEquals(splits[0], "10000000".getBytes()); - assertArrayEquals(splits[1], "20000000".getBytes()); + assertArrayEquals("10000000".getBytes(), splits[0]); + assertArrayEquals("20000000".getBytes(), splits[1]); splits = splitter.split("00000000".getBytes(), "20000000".getBytes(), 2, true); assertEquals(3, splits.length); - assertArrayEquals(splits[1], "10000000".getBytes()); + assertArrayEquals("10000000".getBytes(), splits[1]); + } + + /** + * Unit tests for the DecimalStringSplit algorithm. Makes sure it divides up the + * space of keys in the way that we expect. + */ + @Test + public void unitTestDecimalStringSplit() { + DecimalStringSplit splitter = new DecimalStringSplit(); + // Check splitting while starting from scratch + + byte[][] twoRegionsSplits = splitter.split(2); + assertEquals(1, twoRegionsSplits.length); + assertArrayEquals("50000000".getBytes(), twoRegionsSplits[0]); + + byte[][] threeRegionsSplits = splitter.split(3); + assertEquals(2, threeRegionsSplits.length); + byte[] expectedSplit0 = "33333333".getBytes(); + assertArrayEquals(expectedSplit0, threeRegionsSplits[0]); + byte[] expectedSplit1 = "66666666".getBytes(); + assertArrayEquals(expectedSplit1, threeRegionsSplits[1]); + + // Check splitting existing regions that have start and end points + byte[] splitPoint = splitter.split("10000000".getBytes(), "30000000".getBytes()); + assertArrayEquals("20000000".getBytes(), splitPoint); + + byte[] lastRow = "99999999".getBytes(); + assertArrayEquals(lastRow, splitter.lastRow()); + byte[] firstRow = "00000000".getBytes(); + assertArrayEquals(firstRow, splitter.firstRow()); + + // Halfway between 00... and 20... should be 10... + splitPoint = splitter.split(firstRow, "20000000".getBytes()); + assertArrayEquals("10000000".getBytes(), splitPoint); + + // Halfway between 00... and 19... should be 09... + splitPoint = splitter.split(firstRow, "19999999".getBytes()); + assertArrayEquals("09999999".getBytes(), splitPoint); + + // Halfway between 79... and 99... should be 89.... + splitPoint = splitter.split("79999999".getBytes(), lastRow); + assertArrayEquals("89999999".getBytes(), splitPoint); + + // Check splitting region with multiple mappers per region + byte[][] splits = splitter.split("00000000".getBytes(), "30000000".getBytes(), 3, false); + assertEquals(2, splits.length); + assertArrayEquals("10000000".getBytes(), splits[0]); + assertArrayEquals("20000000".getBytes(), splits[1]); + + splits = splitter.split("00000000".getBytes(), "20000000".getBytes(), 2, true); + assertEquals(3, splits.length); + assertArrayEquals("10000000".getBytes(), splits[1]); } /** @@ -248,6 +301,15 @@ public class TestRegionSplitter { assertFalse(splitFailsPrecondition(algo, "0", "A", 11)); // should be fine assertTrue(splitFailsPrecondition(algo, "0", "A", 12)); // too granular + algo = new DecimalStringSplit(); + assertFalse(splitFailsPrecondition(algo)); // default settings are fine + assertFalse(splitFailsPrecondition(algo, "00", "99")); // custom is fine + assertTrue(splitFailsPrecondition(algo, "99", "00")); // range error + assertTrue(splitFailsPrecondition(algo, "99", "99")); // range error + assertFalse(splitFailsPrecondition(algo, "0", "2", 3)); // should be fine + assertFalse(splitFailsPrecondition(algo, "0", "9", 10)); // should be fine + assertTrue(splitFailsPrecondition(algo, "0", "9", 11)); // too granular + algo = new UniformSplit(); assertFalse(splitFailsPrecondition(algo)); // default settings are fine assertFalse(splitFailsPrecondition(algo, "\\x00", "\\xAA")); // custom is fine