From c236409c3b9fb02bcd26cb632b9cb5b5f7a80652 Mon Sep 17 00:00:00 2001 From: tedyu Date: Fri, 29 Apr 2016 17:08:25 -0700 Subject: [PATCH] HBASE-15357 TableInputFormatBase getSplitKey does not handle signed bytes correctly (Nathan Schile) --- .../hbase/mapreduce/TableInputFormatBase.java | 112 ++++++++---------- .../mapreduce/TestTableInputFormatScan1.java | 39 +++--- 2 files changed, 76 insertions(+), 75 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java index 2cde4b99be4..53148dca109 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/TableInputFormatBase.java @@ -24,6 +24,7 @@ import java.net.InetAddress; import java.net.InetSocketAddress; import java.net.UnknownHostException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -389,14 +390,19 @@ extends InputFormat { // if the current region size is large than the data skew threshold, // split the region into two MapReduce input splits. byte[] splitKey = getSplitKey(ts.getStartRow(), ts.getEndRow(), isTextKey); - //Set the size of child TableSplit as 1/2 of the region size. The exact size of the - // MapReduce input splits is not far off. - TableSplit t1 = new TableSplit(tableName, scan, ts.getStartRow(), splitKey, regionLocation, - encodedRegionName, regionSize / 2); - TableSplit t2 = new TableSplit(tableName, scan, splitKey, ts.getEndRow(), regionLocation, - encodedRegionName, regionSize - regionSize / 2); - resultList.add(t1); - resultList.add(t2); + if (Arrays.equals(ts.getEndRow(), splitKey)) { + // Not splitting since the end key is the same as the split key + resultList.add(ts); + } else { + //Set the size of child TableSplit as 1/2 of the region size. 
The exact size of the + // MapReduce input splits is not far off. + TableSplit t1 = new TableSplit(tableName, scan, ts.getStartRow(), splitKey, + regionLocation, regionSize / 2); + TableSplit t2 = new TableSplit(tableName, scan, splitKey, ts.getEndRow(), regionLocation, + regionSize - regionSize / 2); + resultList.add(t1); + resultList.add(t2); + } count++; } else if (regionSize >= average) { // if the region size between average size and data skew threshold size, @@ -432,11 +438,40 @@ extends InputFormat { * select a split point in the region. The selection of the split point is based on an uniform * distribution assumption for the keys in a region. * Here are some examples: - * startKey: aaabcdefg endKey: aaafff split point: aaad - * startKey: 111000 endKey: 1125790 split point: 111b - * startKey: 1110 endKey: 1120 split point: 111_ - * startKey: binary key { 13, -19, 126, 127 }, endKey: binary key { 13, -19, 127, 0 }, - * split point: binary key { 13, -19, 127, -64 } + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + * + *
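<table><tr><th>start key</th><th>end key</th><th>is text</th><th>split point</th></tr>
<tr><td>'a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'</td><td>'a', 'a', 'a', 'f', 'f', 'f'</td><td>true</td><td>'a', 'a', 'a', 'd', 'd', -78, 50, -77, 51</td></tr>
<tr><td>'1', '1', '1', '0', '0', '0'</td><td>'1', '1', '2', '5', '7', '9', '0'</td><td>true</td><td>'1', '1', '1', -78, -77, -76, -104</td></tr>
<tr><td>'1', '1', '1', '0'</td><td>'1', '1', '2', '0'</td><td>true</td><td>'1', '1', '1', -80</td></tr>
<tr><td>13, -19, 126, 127</td><td>13, -19, 127, 0</td><td>false</td><td>13, -19, 126, -65</td></tr></table>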
+ * * Set this function as "public static", make it easier for test. * * @param start Start key of the region @@ -455,8 +490,8 @@ extends InputFormat { upperLimitByte = '~'; lowerLimitByte = ' '; } else { - upperLimitByte = Byte.MAX_VALUE; - lowerLimitByte = Byte.MIN_VALUE; + upperLimitByte = -1; + lowerLimitByte = 0; } // For special case // Example 1 : startkey=null, endkey="hhhqqqwww", splitKey="h" @@ -475,52 +510,7 @@ extends InputFormat { } return result; } - // A list to store bytes in split key - List resultBytesList = new ArrayList(); - int maxLength = start.length > end.length ? start.length : end.length; - for (int i = 0; i < maxLength; i++) { - //calculate the midpoint byte between the first difference - //for example: "11ae" and "11chw", the midpoint is "11b" - //another example: "11ae" and "11bhw", the first different byte is 'a' and 'b', - // there is no midpoint between 'a' and 'b', so we need to check the next byte. - if (start[i] == end[i]) { - resultBytesList.add(start[i]); - //For special case like: startKey="aaa", endKey="aaaz", splitKey="aaaM" - if (i + 1 == start.length) { - resultBytesList.add((byte) ((lowerLimitByte + end[i + 1]) / 2)); - break; - } - } else { - //if the two bytes differ by 1, like ['a','b'], We need to check the next byte to find - // the midpoint. - if ((int)end[i] - (int)start[i] == 1) { - //get next byte after the first difference - byte startNextByte = (i + 1 < start.length) ? start[i + 1] : lowerLimitByte; - byte endNextByte = (i + 1 < end.length) ? 
end[i + 1] : lowerLimitByte; - int byteRange = (upperLimitByte - startNextByte) + (endNextByte - lowerLimitByte) + 1; - int halfRange = byteRange / 2; - if ((int)startNextByte + halfRange > (int)upperLimitByte) { - resultBytesList.add(end[i]); - resultBytesList.add((byte) (startNextByte + halfRange - upperLimitByte + - lowerLimitByte)); - } else { - resultBytesList.add(start[i]); - resultBytesList.add((byte) (startNextByte + halfRange)); - } - } else { - //calculate the midpoint key by the fist different byte (normal case), - // like "11ae" and "11chw", the midpoint is "11b" - resultBytesList.add((byte) ((start[i] + end[i]) / 2)); - } - break; - } - } - //transform the List of bytes to byte[] - byte[] result = new byte[resultBytesList.size()]; - for (int k = 0; k < resultBytesList.size(); k++) { - result[k] = (byte) resultBytesList.get(k); - } - return result; + return Bytes.split(start, end, false, 1)[1]; } /** diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableInputFormatScan1.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableInputFormatScan1.java index 7d8a8951375..99b40b9846b 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableInputFormatScan1.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableInputFormatScan1.java @@ -125,48 +125,45 @@ public class TestTableInputFormatScan1 extends TestTableInputFormatScanBase { @Test public void testGetSplitsPoint() throws IOException, InterruptedException, ClassNotFoundException { - // Test Case 1: "aaabcdef" and "aaaff", split point is "aaad". byte[] start1 = { 'a', 'a', 'a', 'b', 'c', 'd', 'e', 'f' }; byte[] end1 = { 'a', 'a', 'a', 'f', 'f' }; - byte[] splitPoint1 = { 'a', 'a', 'a', 'd' }; + byte[] splitPoint1 = { 'a', 'a', 'a', 'd', 'd', -78, 50, -77 }; testGetSplitKey(start1, end1, splitPoint1, true); - // Test Case 2: "111000" and "1125790", split point is "111b". 
byte[] start2 = { '1', '1', '1', '0', '0', '0' }; byte[] end2 = { '1', '1', '2', '5', '7', '9', '0' }; - byte[] splitPoint2 = { '1', '1', '1', 'b' }; + byte[] splitPoint2 = { '1', '1', '1', -78, -77, -76, -104 }; testGetSplitKey(start2, end2, splitPoint2, true); - // Test Case 3: "aaaaaa" and "aab", split point is "aaap". byte[] start3 = { 'a', 'a', 'a', 'a', 'a', 'a' }; byte[] end3 = { 'a', 'a', 'b' }; - byte[] splitPoint3 = { 'a', 'a', 'a', 'p' }; + byte[] splitPoint3 = { 'a', 'a', 'a', -80, -80, -80 }; testGetSplitKey(start3, end3, splitPoint3, true); - // Test Case 4: "aaa" and "aaaz", split point is "aaaM". byte[] start4 = { 'a', 'a', 'a' }; byte[] end4 = { 'a', 'a', 'a', 'z' }; - byte[] splitPoint4 = { 'a', 'a', 'a', 'M' }; + byte[] splitPoint4 = { 'a', 'a', 'a', '=' }; testGetSplitKey(start4, end4, splitPoint4, true); - // Test Case 5: "aaa" and "aaba", split point is "aaap". byte[] start5 = { 'a', 'a', 'a' }; byte[] end5 = { 'a', 'a', 'b', 'a' }; - byte[] splitPoint5 = { 'a', 'a', 'a', 'p' }; + byte[] splitPoint5 = { 'a', 'a', 'a', -80 }; testGetSplitKey(start5, end5, splitPoint5, true); // Test Case 6: empty key and "hhhqqqwww", split point is "h" byte[] start6 = {}; byte[] end6 = { 'h', 'h', 'h', 'q', 'q', 'q', 'w', 'w' }; - byte[] splitPoint6 = { 'h' }; - testGetSplitKey(start6, end6, splitPoint6, true); + byte[] splitPointText6 = { 'h' }; + byte[] splitPointBinary6 = { 104 }; + testGetSplitKey(start6, end6, splitPointText6, true); + testGetSplitKey(start6, end6, splitPointBinary6, false); // Test Case 7: "ffffaaa" and empty key, split point depends on the mode we choose(text key or // binary key). 
byte[] start7 = { 'f', 'f', 'f', 'f', 'a', 'a', 'a' }; byte[] end7 = {}; byte[] splitPointText7 = { 'f', '~', '~', '~', '~', '~', '~' }; - byte[] splitPointBinary7 = { 'f', 127, 127, 127, 127, 127, 127 }; + byte[] splitPointBinary7 = { 'f', -1, -1, -1, -1, -1, -1 }; testGetSplitKey(start7, end7, splitPointText7, true); testGetSplitKey(start7, end7, splitPointBinary7, false); @@ -182,8 +179,22 @@ public class TestTableInputFormatScan1 extends TestTableInputFormatScanBase { // Test Case 9: Binary Key example byte[] start9 = { 13, -19, 126, 127 }; byte[] end9 = { 13, -19, 127, 0 }; - byte[] splitPoint9 = { 13, -19, 127, -64 }; + byte[] splitPoint9 = { 13, -19, 126, -65 }; testGetSplitKey(start9, end9, splitPoint9, false); + + // Test Case 10: Binary key split when the start key is an unsigned byte and the end byte is a + // signed byte + byte[] start10 = { 'x' }; + byte[] end10 = { -128 }; + byte[] splitPoint10 = { '|' }; + testGetSplitKey(start10, end10, splitPoint10, false); + + // Test Case 11: Binary key split when the start key is an signed byte and the end byte is a + // signed byte + byte[] start11 = { -100 }; + byte[] end11 = { -90 }; + byte[] splitPoint11 = { -95 }; + testGetSplitKey(start11, end11, splitPoint11, false); } }