HBASE-12223 MultiTableInputFormatBase.getSplits is too slow (Yuanbo Peng)

This commit is contained in:
tedyu 2014-12-18 05:09:01 -08:00
parent 83e4bfaf73
commit 15cf0a6e7b
1 changed files with 62 additions and 43 deletions

View File

@ -46,6 +46,9 @@ import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import java.util.Map;
import java.util.HashMap;
import java.util.Iterator;
/**
* A base for {@link MultiTableInputFormat}s. Receives a list of
* {@link Scan} instances that define the input tables and
@ -129,67 +132,82 @@ public abstract class MultiTableInputFormatBase extends
if (scans.isEmpty()) {
throw new IOException("No scans were provided.");
}
List<InputSplit> splits = new ArrayList<InputSplit>();
Map<TableName, List<Scan>> tableMaps = new HashMap<TableName, List<Scan>>();
for (Scan scan : scans) {
byte[] tableNameBytes = scan.getAttribute(Scan.SCAN_ATTRIBUTES_TABLE_NAME);
if (tableNameBytes == null)
throw new IOException("A scan object did not have a table name");
TableName tableName = TableName.valueOf(tableNameBytes);
List<Scan> scanList = tableMaps.get(tableName);
if (scanList == null) {
scanList = new ArrayList<Scan>();
tableMaps.put(tableName, scanList);
}
scanList.add(scan);
}
List<InputSplit> splits = new ArrayList<InputSplit>();
Iterator iter = tableMaps.entrySet().iterator();
while (iter.hasNext()) {
Map.Entry<TableName, List<Scan>> entry = (Map.Entry<TableName, List<Scan>>) iter.next();
TableName tableName = entry.getKey();
List<Scan> scanList = entry.getValue();
Table table = null;
RegionLocator regionLocator = null;
Connection conn = null;
try {
try{
conn = ConnectionFactory.createConnection(context.getConfiguration());
table = conn.getTable(tableName);
regionLocator = conn.getRegionLocator(tableName);
regionLocator = (RegionLocator) table;
Pair<byte[][], byte[][]> keys = regionLocator.getStartEndKeys();
if (keys == null || keys.getFirst() == null ||
keys.getFirst().length == 0) {
throw new IOException("Expecting at least one region for table : "
+ tableName.getNameAsString());
}
int count = 0;
byte[] startRow = scan.getStartRow();
byte[] stopRow = scan.getStopRow();
RegionSizeCalculator sizeCalculator = new RegionSizeCalculator(
regionLocator, conn.getAdmin());
for (int i = 0; i < keys.getFirst().length; i++) {
if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
continue;
regionLocator, conn.getAdmin());
Pair<byte[][], byte[][]> keys = regionLocator.getStartEndKeys();
for (Scan scan : scanList) {
if (keys == null || keys.getFirst() == null || keys.getFirst().length == 0) {
throw new IOException("Expecting at least one region for table : "
+ tableName.getNameAsString());
}
HRegionLocation hregionLocation = regionLocator.getRegionLocation(
keys.getFirst()[i], false);
String regionHostname = hregionLocation.getHostname();
HRegionInfo regionInfo = hregionLocation.getRegionInfo();
int count = 0;
// determine if the given start and stop keys fall into the range
if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
(stopRow.length == 0 ||
Bytes.compareTo(stopRow, keys.getFirst()[i]) > 0)) {
byte[] splitStart =
startRow.length == 0 ||
Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ? keys
.getFirst()[i] : startRow;
byte[] splitStop =
(stopRow.length == 0 || Bytes.compareTo(keys.getSecond()[i],
stopRow) <= 0) && keys.getSecond()[i].length > 0 ? keys
.getSecond()[i] : stopRow;
byte[] startRow = scan.getStartRow();
byte[] stopRow = scan.getStopRow();
long regionSize = sizeCalculator.getRegionSize(regionInfo.getRegionName());
TableSplit split =
new TableSplit(regionLocator.getName(),
scan, splitStart, splitStop, regionHostname, regionSize);
for (int i = 0; i < keys.getFirst().length; i++) {
if (!includeRegionInSplit(keys.getFirst()[i], keys.getSecond()[i])) {
continue;
}
splits.add(split);
if (LOG.isDebugEnabled())
LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
if ((startRow.length == 0 || keys.getSecond()[i].length == 0 ||
Bytes.compareTo(startRow, keys.getSecond()[i]) < 0) &&
(stopRow.length == 0 || Bytes.compareTo(stopRow,
keys.getFirst()[i]) > 0)) {
byte[] splitStart = startRow.length == 0 ||
Bytes.compareTo(keys.getFirst()[i], startRow) >= 0 ?
keys.getFirst()[i] : startRow;
byte[] splitStop = (stopRow.length == 0 ||
Bytes.compareTo(keys.getSecond()[i], stopRow) <= 0) &&
keys.getSecond()[i].length > 0 ?
keys.getSecond()[i] : stopRow;
HRegionLocation hregionLocation = regionLocator.getRegionLocation(
keys.getFirst()[i], false);
String regionHostname = hregionLocation.getHostname();
HRegionInfo regionInfo = hregionLocation.getRegionInfo();
long regionSize = sizeCalculator.getRegionSize(
regionInfo.getRegionName());
TableSplit split = new TableSplit(table.getName(),
scan, splitStart, splitStop, regionHostname, regionSize);
splits.add(split);
if (LOG.isDebugEnabled())
LOG.debug("getSplits: split -> " + (count++) + " -> " + split);
}
}
}
} finally {
@ -198,6 +216,7 @@ public abstract class MultiTableInputFormatBase extends
if (null != conn) conn.close();
}
}
return splits;
}