HBASE-2302 Optimize M-R by bulk excluding regions - fewer InputSplits to avoid traffic on region servers when performing M-R on a subset of the table
git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@922076 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
f3f7b0622f
commit
0bac7d5b31
|
@ -422,6 +422,9 @@ Release 0.21.0 - Unreleased
|
||||||
HBASE-2267 More improvements to the Maven build (Lars Francke via Stack)
|
HBASE-2267 More improvements to the Maven build (Lars Francke via Stack)
|
||||||
HBASE-2174 Stop from resolving HRegionServer addresses to names using DNS on
|
HBASE-2174 Stop from resolving HRegionServer addresses to names using DNS on
|
||||||
every heartbeat (Karthik Ranganathan via Stack)
|
every heartbeat (Karthik Ranganathan via Stack)
|
||||||
|
HBASE-2302 Optimize M-R by bulk excluding regions - less InputSplit-s to
|
||||||
|
avoid traffic on region servers when performing M-R on a subset
|
||||||
|
of the table (Kay Kay via Stack)
|
||||||
|
|
||||||
NEW FEATURES
|
NEW FEATURES
|
||||||
HBASE-1961 HBase EC2 scripts
|
HBASE-1961 HBase EC2 scripts
|
||||||
|
|
|
@ -281,6 +281,9 @@ extends InputFormat<ImmutableBytesWritable, Result> {
|
||||||
int count = 0;
|
int count = 0;
|
||||||
List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
|
List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
|
||||||
for (int i = 0; i < keys.getFirst().length; i++) {
|
for (int i = 0; i < keys.getFirst().length; i++) {
|
||||||
|
if ( !includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
String regionLocation = table.getRegionLocation(keys.getFirst()[i]).
|
String regionLocation = table.getRegionLocation(keys.getFirst()[i]).
|
||||||
getServerAddress().getHostname();
|
getServerAddress().getHostname();
|
||||||
byte[] startRow = scan.getStartRow();
|
byte[] startRow = scan.getStartRow();
|
||||||
|
@ -307,6 +310,34 @@ extends InputFormat<ImmutableBytesWritable, Result> {
|
||||||
return splits;
|
return splits;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* Test if the given region is to be included in the InputSplit while splitting
|
||||||
|
* the regions of a table.
|
||||||
|
* <p>
|
||||||
|
* This optimization is effective when there is a specific reason to exclude an entire region from the M-R job,
|
||||||
|
* (so that it contributes no InputSplit), given the region's start and end keys. <br>
|
||||||
|
* Useful when we need to remember the last-processed top record and revisit the [last, current) interval for M-R processing,
|
||||||
|
* continuously. Besides reducing the number of InputSplits, this also reduces the load on the region servers, due to the ordering of the keys.
|
||||||
|
* <br>
|
||||||
|
* <br>
|
||||||
|
* Note: It is possible that <code>endKey.length == 0</code> for the last (recent) region.
|
||||||
|
* <br>
|
||||||
|
* Override this method if you want to bulk exclude regions altogether from M-R. By default, no region is excluded (i.e. all regions are included).
|
||||||
|
*
|
||||||
|
*
|
||||||
|
* @param startKey Start key of the region
|
||||||
|
* @param endKey End key of the region
|
||||||
|
* @return true if this region should be included as part of the input (the default).
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Allows subclasses to get the {@link HTable}.
|
* Allows subclasses to get the {@link HTable}.
|
||||||
*/
|
*/
|
||||||
|
|
Loading…
Reference in New Issue