HBASE-2302 Optimize M-R by bulk excluding regions - fewer InputSplits, to avoid traffic on region servers when performing M-R on a subset of the table

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@922076 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Michael Stack 2010-03-11 23:51:23 +00:00
parent f3f7b0622f
commit 0bac7d5b31
2 changed files with 35 additions and 1 deletions

View File

@ -422,6 +422,9 @@ Release 0.21.0 - Unreleased
HBASE-2267 More improvements to the Maven build (Lars Francke via Stack)
HBASE-2174 Stop from resolving HRegionServer addresses to names using DNS on
every heartbeat (Karthik Ranganathan via Stack)
HBASE-2302 Optimize M-R by bulk excluding regions - fewer InputSplits, to
avoid traffic on region servers when performing M-R on a subset
of the table (Kay Kay via Stack)
NEW FEATURES
HBASE-1961 HBase EC2 scripts

View File

@ -281,6 +281,9 @@ extends InputFormat<ImmutableBytesWritable, Result> {
int count = 0; int count = 0;
List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length); List<InputSplit> splits = new ArrayList<InputSplit>(keys.getFirst().length);
for (int i = 0; i < keys.getFirst().length; i++) { for (int i = 0; i < keys.getFirst().length; i++) {
if ( !includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
continue;
}
String regionLocation = table.getRegionLocation(keys.getFirst()[i]). String regionLocation = table.getRegionLocation(keys.getFirst()[i]).
getServerAddress().getHostname(); getServerAddress().getHostname();
byte[] startRow = scan.getStartRow(); byte[] startRow = scan.getStartRow();
@ -307,6 +310,34 @@ extends InputFormat<ImmutableBytesWritable, Result> {
return splits; return splits;
} }
/**
 * Tests whether the given region should be included in the InputSplits while
 * splitting the regions of a table.
 * <p>
 * This hook is consulted once per region while the splits are being built; a
 * region for which it returns <code>false</code> is skipped and contributes no
 * InputSplit.
 * <p>
 * This optimization is effective when there is a specific reason to exclude an
 * entire region from the M-R job, given the region's start and end keys.
 * <br>
 * It is useful when the job needs to remember the last-processed top record and
 * repeatedly revisit only the [last, current) interval. In addition to reducing
 * the number of InputSplits, it reduces the load on the region servers as well,
 * due to the ordering of the keys.
 * <br>
 * <br>
 * Note: it is possible that <code>endKey.length == 0</code> for the last (most
 * recent) region.
 * <br>
 * Override this method to bulk exclude regions from M-R altogether. By default
 * no region is excluded (i.e. all regions are included).
 *
 * @param startKey Start key of the region
 * @param endKey End key of the region; may be empty (see the note above)
 * @return <code>true</code> if this region is to be included as part of the
 *   input (the default); <code>false</code> to exclude it.
 */
protected boolean includeRegionInSplit(final byte[] startKey, final byte [] endKey) {
// Default policy: include every region. Subclasses override to filter.
return true;
}
/** /**
* Allows subclasses to get the {@link HTable}. * Allows subclasses to get the {@link HTable}.
*/ */