From b8ba3f788bac747d8266b90b6966cfdb8435450f Mon Sep 17 00:00:00 2001 From: huaxiangsun Date: Wed, 8 Sep 2021 09:46:34 -0700 Subject: [PATCH] HBASE-26255 Add an option to use region location from meta table in TableSnapshotInputFormat (#3661) Signed-off-by: Anoop Sam John --- .../TableSnapshotInputFormatImpl.java | 63 +++++++++++++++---- .../TestTableSnapshotInputFormat.java | 29 +++++++++ 2 files changed, 80 insertions(+), 12 deletions(-) diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java index abdd8f47a6e..22c19be7ce2 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/TableSnapshotInputFormatImpl.java @@ -31,10 +31,14 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HDFSBlocksDistribution; import org.apache.hadoop.hbase.HDFSBlocksDistribution.HostAndWeight; +import org.apache.hadoop.hbase.HRegionLocation; import org.apache.hadoop.hbase.PrivateCellUtil; import org.apache.hadoop.hbase.client.ClientSideRegionScanner; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.ConnectionFactory; import org.apache.hadoop.hbase.client.IsolationLevel; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionLocator; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Scan; import org.apache.hadoop.hbase.client.TableDescriptor; @@ -101,6 +105,15 @@ public class TableSnapshotInputFormatImpl { "hbase.TableSnapshotInputFormat.locality.enabled"; public static final boolean SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_DEFAULT = true; + /** + * Whether to calculate the Snapshot region location by region location from meta. + * It is much faster than computing block locations for splits. + */ + public static final String SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION = + "hbase.TableSnapshotInputFormat.locality.by.region.location"; + + public static final boolean SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION_DEFAULT = false; + /** * In some scenario, scan limited rows on each InputSplit for sampling data extraction */ @@ -392,17 +405,49 @@ public class TableSnapshotInputFormatImpl { SNAPSHOT_INPUTFORMAT_SCAN_METRICS_ENABLED_DEFAULT); scan.setScanMetricsEnabled(scanMetricsEnabled); + boolean useRegionLoc = conf.getBoolean(SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION, + SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION_DEFAULT); + + Connection connection = null; + RegionLocator regionLocator = null; + if (localityEnabled && useRegionLoc) { + Configuration newConf = new Configuration(conf); + newConf.setInt("hbase.hconnection.threads.max", 1); + try { + connection = ConnectionFactory.createConnection(newConf); + regionLocator = connection.getRegionLocator(htd.getTableName()); + + /* Get all locations for the table and cache it */ + regionLocator.getAllRegionLocations(); + } finally { + if (connection != null) { + connection.close(); + } + } + } + List splits = new ArrayList<>(); for (RegionInfo hri : regionManifests) { // load region descriptor + List hosts = null; + if (localityEnabled) { + if (regionLocator != null) { + /* Get Location from the local cache */ + HRegionLocation + location = regionLocator.getRegionLocation(hri.getStartKey(), false); + + hosts = new ArrayList<>(1); + hosts.add(location.getHostname()); + } else { + hosts = calculateLocationsForInputSplit(conf, htd, hri, tableDir); + } + } if (numSplits > 1) { byte[][] sp = sa.split(hri.getStartKey(), hri.getEndKey(), numSplits, true); for (int i = 0; i < sp.length - 1; i++) { if (PrivateCellUtil.overlappingKeys(scan.getStartRow(), scan.getStopRow(), sp[i], sp[i + 1])) { - List hosts = - calculateLocationsForInputSplit(conf, htd, hri, tableDir, localityEnabled); Scan boundedScan = new Scan(scan); if (scan.getStartRow().length == 0) { @@ -425,8 +470,7 @@ public class TableSnapshotInputFormatImpl { } else { if (PrivateCellUtil.overlappingKeys(scan.getStartRow(), scan.getStopRow(), hri.getStartKey(), hri.getEndKey())) { - List hosts = - calculateLocationsForInputSplit(conf, htd, hri, tableDir, localityEnabled); + splits.add(new InputSplit(htd, hri, hosts, scan, restoreDir)); } } @@ -440,14 +484,9 @@ public class TableSnapshotInputFormatImpl { * only when localityEnabled is true. */ private static List calculateLocationsForInputSplit(Configuration conf, - TableDescriptor htd, RegionInfo hri, Path tableDir, boolean localityEnabled) - throws IOException { - if (localityEnabled) { // care block locality - return getBestLocations(conf, - HRegion.computeHDFSBlocksDistribution(conf, htd, hri, tableDir)); - } else { // do not care block locality - return null; - } + TableDescriptor htd, RegionInfo hri, Path tableDir) + throws IOException { + return getBestLocations(conf, HRegion.computeHDFSBlocksDistribution(conf, htd, hri, tableDir)); } /** diff --git a/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableSnapshotInputFormat.java b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableSnapshotInputFormat.java index 15a61e0e5c8..34e6b274dab 100644 --- a/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableSnapshotInputFormat.java +++ b/hbase-mapreduce/src/test/java/org/apache/hadoop/hbase/mapreduce/TestTableSnapshotInputFormat.java @@ -20,6 +20,8 @@ package org.apache.hadoop.hbase.mapreduce; import static org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormatImpl.SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_DEFAULT; import static org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormatImpl.SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_KEY; import static org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormatImpl.SNAPSHOT_INPUTFORMAT_ROW_LIMIT_PER_INPUTSPLIT; +import static org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormatImpl.SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION; +import static org.apache.hadoop.hbase.mapreduce.TableSnapshotInputFormatImpl.SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION_DEFAULT; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.when; @@ -198,6 +200,18 @@ public class TestTableSnapshotInputFormat extends TableSnapshotInputFormatTestBa } } + @Test + public void testWithMockedMapReduceSingleRegionByRegionLocation() throws Exception { + Configuration conf = UTIL.getConfiguration(); + conf.setBoolean(SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION, true); + try { + testWithMockedMapReduce(UTIL, name.getMethodName() + "Snapshot", 1, 1, 1, + true); + } finally { + conf.unset(SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION); + } + } + @Override public void testRestoreSnapshotDoesNotCreateBackRefLinksInit(TableName tableName, String snapshotName, Path tmpTableDir) throws Exception { @@ -218,6 +232,8 @@ public class TestTableSnapshotInputFormat extends TableSnapshotInputFormatTestBa Configuration conf = util.getConfiguration(); conf.setBoolean(SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_KEY, setLocalityEnabledTo); + conf.setBoolean(SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION, + SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION_DEFAULT); Job job = new Job(conf); Path tmpTableDir = util.getDataTestDirOnTestFS(snapshotName); Scan scan = new Scan().withStartRow(getStartRow()).withStopRow(getEndRow()); // limit the scan @@ -406,6 +422,9 @@ public class TestTableSnapshotInputFormat extends TableSnapshotInputFormatTestBa job.getConfiguration().getBoolean(SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_KEY, SNAPSHOT_INPUTFORMAT_LOCALITY_ENABLED_DEFAULT); + boolean byRegionLoc = + job.getConfiguration().getBoolean(SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION, + SNAPSHOT_INPUTFORMAT_LOCALITY_BY_REGION_LOCATION_DEFAULT); for (int i = 0; i < splits.size(); i++) { // validate input split InputSplit split = splits.get(i); @@ -413,6 +432,16 @@ public class TestTableSnapshotInputFormat extends TableSnapshotInputFormatTestBa TableSnapshotRegionSplit snapshotRegionSplit = (TableSnapshotRegionSplit) split; if (localityEnabled) { Assert.assertTrue(split.getLocations() != null && split.getLocations().length != 0); + if (byRegionLoc) { + // When it uses region location from meta, the hostname will be "localhost", + // the location from hdfs block location is "127.0.0.1". + Assert.assertEquals(1, split.getLocations().length); + Assert.assertTrue("Not using region location!", + split.getLocations()[0].equals("localhost")); + } else { + Assert.assertTrue("Not using region location!", + split.getLocations()[0].equals("127.0.0.1")); + } } else { Assert.assertTrue(split.getLocations() != null && split.getLocations().length == 0); }