HBASE-15357 TableInputFormatBase getSplitKey does not handle signed bytes correctly (Nathan Schile)
This commit is contained in:
parent
730b077666
commit
c236409c3b
|
@ -24,6 +24,7 @@ import java.net.InetAddress;
|
||||||
import java.net.InetSocketAddress;
|
import java.net.InetSocketAddress;
|
||||||
import java.net.UnknownHostException;
|
import java.net.UnknownHostException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
|
import java.util.Arrays;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
|
||||||
|
@ -389,14 +390,19 @@ extends InputFormat<ImmutableBytesWritable, Result> {
|
||||||
// if the current region size is larger than the data skew threshold,
|
// if the current region size is larger than the data skew threshold,
|
||||||
// split the region into two MapReduce input splits.
|
// split the region into two MapReduce input splits.
|
||||||
byte[] splitKey = getSplitKey(ts.getStartRow(), ts.getEndRow(), isTextKey);
|
byte[] splitKey = getSplitKey(ts.getStartRow(), ts.getEndRow(), isTextKey);
|
||||||
|
if (Arrays.equals(ts.getEndRow(), splitKey)) {
|
||||||
|
// Not splitting since the end key is the same as the split key
|
||||||
|
resultList.add(ts);
|
||||||
|
} else {
|
||||||
//Set the size of child TableSplit as 1/2 of the region size. The exact size of the
|
//Set the size of child TableSplit as 1/2 of the region size. The exact size of the
|
||||||
// MapReduce input splits is not far off.
|
// MapReduce input splits is not far off.
|
||||||
TableSplit t1 = new TableSplit(tableName, scan, ts.getStartRow(), splitKey, regionLocation,
|
TableSplit t1 = new TableSplit(tableName, scan, ts.getStartRow(), splitKey,
|
||||||
encodedRegionName, regionSize / 2);
|
regionLocation, regionSize / 2);
|
||||||
TableSplit t2 = new TableSplit(tableName, scan, splitKey, ts.getEndRow(), regionLocation,
|
TableSplit t2 = new TableSplit(tableName, scan, splitKey, ts.getEndRow(), regionLocation,
|
||||||
encodedRegionName, regionSize - regionSize / 2);
|
regionSize - regionSize / 2);
|
||||||
resultList.add(t1);
|
resultList.add(t1);
|
||||||
resultList.add(t2);
|
resultList.add(t2);
|
||||||
|
}
|
||||||
count++;
|
count++;
|
||||||
} else if (regionSize >= average) {
|
} else if (regionSize >= average) {
|
||||||
// if the region size is between average size and data skew threshold size,
|
// if the region size is between average size and data skew threshold size,
|
||||||
|
@ -432,11 +438,40 @@ extends InputFormat<ImmutableBytesWritable, Result> {
|
||||||
* select a split point in the region. The selection of the split point is based on a uniform
|
* select a split point in the region. The selection of the split point is based on a uniform
|
||||||
* distribution assumption for the keys in a region.
|
* distribution assumption for the keys in a region.
|
||||||
* Here are some examples:
|
* Here are some examples:
|
||||||
* startKey: aaabcdefg endKey: aaafff split point: aaad
|
*
|
||||||
* startKey: 111000 endKey: 1125790 split point: 111b
|
* <table>
|
||||||
* startKey: 1110 endKey: 1120 split point: 111_
|
* <tr>
|
||||||
* startKey: binary key { 13, -19, 126, 127 }, endKey: binary key { 13, -19, 127, 0 },
|
* <th>start key</th>
|
||||||
* split point: binary key { 13, -19, 127, -64 }
|
* <th>end key</th>
|
||||||
|
* <th>is text</th>
|
||||||
|
* <th>split point</th>
|
||||||
|
* </tr>
|
||||||
|
* <tr>
|
||||||
|
* <td>'a', 'a', 'a', 'b', 'c', 'd', 'e', 'f', 'g'</td>
|
||||||
|
* <td>'a', 'a', 'a', 'f', 'f', 'f'</td>
|
||||||
|
* <td>true</td>
|
||||||
|
* <td>'a', 'a', 'a', 'd', 'd', -78, 50, -77, 51</td>
|
||||||
|
* </tr>
|
||||||
|
* <tr>
|
||||||
|
* <td>'1', '1', '1', '0', '0', '0'</td>
|
||||||
|
* <td>'1', '1', '2', '5', '7', '9', '0'</td>
|
||||||
|
* <td>true</td>
|
||||||
|
* <td>'1', '1', '1', -78, -77, -76, -104</td>
|
||||||
|
* </tr>
|
||||||
|
* <tr>
|
||||||
|
* <td>'1', '1', '1', '0'</td>
|
||||||
|
* <td>'1', '1', '2', '0'</td>
|
||||||
|
* <td>true</td>
|
||||||
|
* <td>'1', '1', '1', -80</td>
|
||||||
|
* </tr>
|
||||||
|
* <tr>
|
||||||
|
* <td>13, -19, 126, 127</td>
|
||||||
|
* <td>13, -19, 127, 0</td>
|
||||||
|
* <td>false</td>
|
||||||
|
* <td>13, -19, 126, -65</td>
|
||||||
|
* </tr>
|
||||||
|
* </table>
|
||||||
|
*
|
||||||
* This function is "public static" to make it easier to test.
|
* This function is "public static" to make it easier to test.
|
||||||
*
|
*
|
||||||
* @param start Start key of the region
|
* @param start Start key of the region
|
||||||
|
@ -455,8 +490,8 @@ extends InputFormat<ImmutableBytesWritable, Result> {
|
||||||
upperLimitByte = '~';
|
upperLimitByte = '~';
|
||||||
lowerLimitByte = ' ';
|
lowerLimitByte = ' ';
|
||||||
} else {
|
} else {
|
||||||
upperLimitByte = Byte.MAX_VALUE;
|
upperLimitByte = -1;
|
||||||
lowerLimitByte = Byte.MIN_VALUE;
|
lowerLimitByte = 0;
|
||||||
}
|
}
|
||||||
// For special case
|
// For special case
|
||||||
// Example 1 : startkey=null, endkey="hhhqqqwww", splitKey="h"
|
// Example 1 : startkey=null, endkey="hhhqqqwww", splitKey="h"
|
||||||
|
@ -475,52 +510,7 @@ extends InputFormat<ImmutableBytesWritable, Result> {
|
||||||
}
|
}
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
// A list to store bytes in split key
|
return Bytes.split(start, end, false, 1)[1];
|
||||||
List resultBytesList = new ArrayList();
|
|
||||||
int maxLength = start.length > end.length ? start.length : end.length;
|
|
||||||
for (int i = 0; i < maxLength; i++) {
|
|
||||||
//calculate the midpoint byte between the first difference
|
|
||||||
//for example: "11ae" and "11chw", the midpoint is "11b"
|
|
||||||
//another example: "11ae" and "11bhw", the first different byte is 'a' and 'b',
|
|
||||||
// there is no midpoint between 'a' and 'b', so we need to check the next byte.
|
|
||||||
if (start[i] == end[i]) {
|
|
||||||
resultBytesList.add(start[i]);
|
|
||||||
//For special case like: startKey="aaa", endKey="aaaz", splitKey="aaaM"
|
|
||||||
if (i + 1 == start.length) {
|
|
||||||
resultBytesList.add((byte) ((lowerLimitByte + end[i + 1]) / 2));
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
//if the two bytes differ by 1, like ['a','b'], We need to check the next byte to find
|
|
||||||
// the midpoint.
|
|
||||||
if ((int)end[i] - (int)start[i] == 1) {
|
|
||||||
//get next byte after the first difference
|
|
||||||
byte startNextByte = (i + 1 < start.length) ? start[i + 1] : lowerLimitByte;
|
|
||||||
byte endNextByte = (i + 1 < end.length) ? end[i + 1] : lowerLimitByte;
|
|
||||||
int byteRange = (upperLimitByte - startNextByte) + (endNextByte - lowerLimitByte) + 1;
|
|
||||||
int halfRange = byteRange / 2;
|
|
||||||
if ((int)startNextByte + halfRange > (int)upperLimitByte) {
|
|
||||||
resultBytesList.add(end[i]);
|
|
||||||
resultBytesList.add((byte) (startNextByte + halfRange - upperLimitByte +
|
|
||||||
lowerLimitByte));
|
|
||||||
} else {
|
|
||||||
resultBytesList.add(start[i]);
|
|
||||||
resultBytesList.add((byte) (startNextByte + halfRange));
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
//calculate the midpoint key by the first different byte (normal case),
|
|
||||||
// like "11ae" and "11chw", the midpoint is "11b"
|
|
||||||
resultBytesList.add((byte) ((start[i] + end[i]) / 2));
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
//transform the List of bytes to byte[]
|
|
||||||
byte[] result = new byte[resultBytesList.size()];
|
|
||||||
for (int k = 0; k < resultBytesList.size(); k++) {
|
|
||||||
result[k] = (byte) resultBytesList.get(k);
|
|
||||||
}
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
|
@ -125,48 +125,45 @@ public class TestTableInputFormatScan1 extends TestTableInputFormatScanBase {
|
||||||
@Test
|
@Test
|
||||||
public void testGetSplitsPoint() throws IOException, InterruptedException,
|
public void testGetSplitsPoint() throws IOException, InterruptedException,
|
||||||
ClassNotFoundException {
|
ClassNotFoundException {
|
||||||
// Test Case 1: "aaabcdef" and "aaaff", split point is "aaad".
|
|
||||||
byte[] start1 = { 'a', 'a', 'a', 'b', 'c', 'd', 'e', 'f' };
|
byte[] start1 = { 'a', 'a', 'a', 'b', 'c', 'd', 'e', 'f' };
|
||||||
byte[] end1 = { 'a', 'a', 'a', 'f', 'f' };
|
byte[] end1 = { 'a', 'a', 'a', 'f', 'f' };
|
||||||
byte[] splitPoint1 = { 'a', 'a', 'a', 'd' };
|
byte[] splitPoint1 = { 'a', 'a', 'a', 'd', 'd', -78, 50, -77 };
|
||||||
testGetSplitKey(start1, end1, splitPoint1, true);
|
testGetSplitKey(start1, end1, splitPoint1, true);
|
||||||
|
|
||||||
// Test Case 2: "111000" and "1125790", split point is "111b".
|
|
||||||
byte[] start2 = { '1', '1', '1', '0', '0', '0' };
|
byte[] start2 = { '1', '1', '1', '0', '0', '0' };
|
||||||
byte[] end2 = { '1', '1', '2', '5', '7', '9', '0' };
|
byte[] end2 = { '1', '1', '2', '5', '7', '9', '0' };
|
||||||
byte[] splitPoint2 = { '1', '1', '1', 'b' };
|
byte[] splitPoint2 = { '1', '1', '1', -78, -77, -76, -104 };
|
||||||
testGetSplitKey(start2, end2, splitPoint2, true);
|
testGetSplitKey(start2, end2, splitPoint2, true);
|
||||||
|
|
||||||
// Test Case 3: "aaaaaa" and "aab", split point is "aaap".
|
|
||||||
byte[] start3 = { 'a', 'a', 'a', 'a', 'a', 'a' };
|
byte[] start3 = { 'a', 'a', 'a', 'a', 'a', 'a' };
|
||||||
byte[] end3 = { 'a', 'a', 'b' };
|
byte[] end3 = { 'a', 'a', 'b' };
|
||||||
byte[] splitPoint3 = { 'a', 'a', 'a', 'p' };
|
byte[] splitPoint3 = { 'a', 'a', 'a', -80, -80, -80 };
|
||||||
testGetSplitKey(start3, end3, splitPoint3, true);
|
testGetSplitKey(start3, end3, splitPoint3, true);
|
||||||
|
|
||||||
// Test Case 4: "aaa" and "aaaz", split point is "aaaM".
|
|
||||||
byte[] start4 = { 'a', 'a', 'a' };
|
byte[] start4 = { 'a', 'a', 'a' };
|
||||||
byte[] end4 = { 'a', 'a', 'a', 'z' };
|
byte[] end4 = { 'a', 'a', 'a', 'z' };
|
||||||
byte[] splitPoint4 = { 'a', 'a', 'a', 'M' };
|
byte[] splitPoint4 = { 'a', 'a', 'a', '=' };
|
||||||
testGetSplitKey(start4, end4, splitPoint4, true);
|
testGetSplitKey(start4, end4, splitPoint4, true);
|
||||||
|
|
||||||
// Test Case 5: "aaa" and "aaba", split point is "aaap".
|
|
||||||
byte[] start5 = { 'a', 'a', 'a' };
|
byte[] start5 = { 'a', 'a', 'a' };
|
||||||
byte[] end5 = { 'a', 'a', 'b', 'a' };
|
byte[] end5 = { 'a', 'a', 'b', 'a' };
|
||||||
byte[] splitPoint5 = { 'a', 'a', 'a', 'p' };
|
byte[] splitPoint5 = { 'a', 'a', 'a', -80 };
|
||||||
testGetSplitKey(start5, end5, splitPoint5, true);
|
testGetSplitKey(start5, end5, splitPoint5, true);
|
||||||
|
|
||||||
// Test Case 6: empty key and "hhhqqqwww", split point is "h"
|
// Test Case 6: empty key and "hhhqqqwww", split point is "h"
|
||||||
byte[] start6 = {};
|
byte[] start6 = {};
|
||||||
byte[] end6 = { 'h', 'h', 'h', 'q', 'q', 'q', 'w', 'w' };
|
byte[] end6 = { 'h', 'h', 'h', 'q', 'q', 'q', 'w', 'w' };
|
||||||
byte[] splitPoint6 = { 'h' };
|
byte[] splitPointText6 = { 'h' };
|
||||||
testGetSplitKey(start6, end6, splitPoint6, true);
|
byte[] splitPointBinary6 = { 104 };
|
||||||
|
testGetSplitKey(start6, end6, splitPointText6, true);
|
||||||
|
testGetSplitKey(start6, end6, splitPointBinary6, false);
|
||||||
|
|
||||||
// Test Case 7: "ffffaaa" and empty key, split point depends on the mode we choose(text key or
|
// Test Case 7: "ffffaaa" and empty key, split point depends on the mode we choose(text key or
|
||||||
// binary key).
|
// binary key).
|
||||||
byte[] start7 = { 'f', 'f', 'f', 'f', 'a', 'a', 'a' };
|
byte[] start7 = { 'f', 'f', 'f', 'f', 'a', 'a', 'a' };
|
||||||
byte[] end7 = {};
|
byte[] end7 = {};
|
||||||
byte[] splitPointText7 = { 'f', '~', '~', '~', '~', '~', '~' };
|
byte[] splitPointText7 = { 'f', '~', '~', '~', '~', '~', '~' };
|
||||||
byte[] splitPointBinary7 = { 'f', 127, 127, 127, 127, 127, 127 };
|
byte[] splitPointBinary7 = { 'f', -1, -1, -1, -1, -1, -1 };
|
||||||
testGetSplitKey(start7, end7, splitPointText7, true);
|
testGetSplitKey(start7, end7, splitPointText7, true);
|
||||||
testGetSplitKey(start7, end7, splitPointBinary7, false);
|
testGetSplitKey(start7, end7, splitPointBinary7, false);
|
||||||
|
|
||||||
|
@ -182,8 +179,22 @@ public class TestTableInputFormatScan1 extends TestTableInputFormatScanBase {
|
||||||
// Test Case 9: Binary Key example
|
// Test Case 9: Binary Key example
|
||||||
byte[] start9 = { 13, -19, 126, 127 };
|
byte[] start9 = { 13, -19, 126, 127 };
|
||||||
byte[] end9 = { 13, -19, 127, 0 };
|
byte[] end9 = { 13, -19, 127, 0 };
|
||||||
byte[] splitPoint9 = { 13, -19, 127, -64 };
|
byte[] splitPoint9 = { 13, -19, 126, -65 };
|
||||||
testGetSplitKey(start9, end9, splitPoint9, false);
|
testGetSplitKey(start9, end9, splitPoint9, false);
|
||||||
|
|
||||||
|
// Test Case 10: Binary key split when the start key is an unsigned byte and the end byte is a
|
||||||
|
// signed byte
|
||||||
|
byte[] start10 = { 'x' };
|
||||||
|
byte[] end10 = { -128 };
|
||||||
|
byte[] splitPoint10 = { '|' };
|
||||||
|
testGetSplitKey(start10, end10, splitPoint10, false);
|
||||||
|
|
||||||
|
// Test Case 11: Binary key split when the start key is a signed byte and the end byte is a
|
||||||
|
// signed byte
|
||||||
|
byte[] start11 = { -100 };
|
||||||
|
byte[] end11 = { -90 };
|
||||||
|
byte[] splitPoint11 = { -95 };
|
||||||
|
testGetSplitKey(start11, end11, splitPoint11, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue