HBASE-14223 Meta WALs are not cleared if meta region was closed and RS aborts

This commit is contained in:
Enis Soztutar 2015-11-30 17:07:54 -08:00
parent a4f445b20e
commit c719e8c456
14 changed files with 291 additions and 14 deletions

View File

@ -0,0 +1,36 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.chaos.actions;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.chaos.factories.MonkeyConstants;
/**
 * Chaos monkey action that moves the region(s) of the meta table. All of the work is inherited
 * from {@link MoveRegionsOfTableAction}; this class only pins the target table to
 * {@link TableName#META_TABLE_NAME}.
 */
public class MoveMetaAction extends MoveRegionsOfTableAction {

  /** Sleep time handed to the superclass when the no-arg constructor is used. */
  private static final long DEFAULT_SLEEP_TIME = -1;

  /**
   * Creates the action with the default sleep time and
   * {@link MonkeyConstants#DEFAULT_MOVE_REGIONS_MAX_TIME} as the maximum sleep time.
   */
  public MoveMetaAction() {
    this(DEFAULT_SLEEP_TIME, MonkeyConstants.DEFAULT_MOVE_REGIONS_MAX_TIME);
  }

  /**
   * @param sleepTime sleep time passed through to {@link MoveRegionsOfTableAction}
   * @param maxSleepTime maximum sleep time passed through to {@link MoveRegionsOfTableAction}
   */
  public MoveMetaAction(long sleepTime, long maxSleepTime) {
    super(sleepTime, maxSleepTime, TableName.META_TABLE_NAME);
  }
}

View File

@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.chaos.actions.DumpClusterStatusAction;
import org.apache.hadoop.hbase.chaos.actions.FlushRandomRegionOfTableAction;
import org.apache.hadoop.hbase.chaos.actions.FlushTableAction;
import org.apache.hadoop.hbase.chaos.actions.MergeRandomAdjacentRegionsOfTableAction;
import org.apache.hadoop.hbase.chaos.actions.MoveMetaAction;
import org.apache.hadoop.hbase.chaos.actions.MoveRandomRegionOfTableAction;
import org.apache.hadoop.hbase.chaos.actions.MoveRegionsOfTableAction;
import org.apache.hadoop.hbase.chaos.actions.RemoveColumnAction;
@ -52,7 +53,8 @@ public class NoKillMonkeyFactory extends MonkeyFactory {
MonkeyConstants.DEFAULT_COMPACT_RANDOM_REGION_RATIO),
new FlushTableAction(tableName),
new FlushRandomRegionOfTableAction(tableName),
new MoveRandomRegionOfTableAction(tableName)
new MoveRandomRegionOfTableAction(tableName),
new MoveMetaAction()
};
Action[] actions2 = new Action[] {

View File

@ -57,7 +57,8 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
new CompactRandomRegionOfTableAction(tableName, compactRandomRegionRatio),
new FlushTableAction(tableName),
new FlushRandomRegionOfTableAction(tableName),
new MoveRandomRegionOfTableAction(tableName)
new MoveRandomRegionOfTableAction(tableName),
new MoveMetaAction()
};
// Actions such as split/merge/snapshot.
@ -89,6 +90,7 @@ public class SlowDeterministicMonkeyFactory extends MonkeyFactory {
new RestartRsHoldingMetaAction(restartRsHoldingMetaSleepTime),
new DecreaseMaxHFileSizeAction(decreaseHFileSizeSleepTime, tableName),
new SplitAllRegionOfTableAction(tableName),
new MoveMetaAction()
};
// Action to log more info for debugging

View File

@ -34,7 +34,8 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
new CompactTableAction(tableName, 0.5f),
new CompactRandomRegionOfTableAction(tableName, 0.6f),
new FlushTableAction(tableName),
new FlushRandomRegionOfTableAction(tableName)
new FlushRandomRegionOfTableAction(tableName),
new MoveMetaAction()
};
Action[] actions2 = new Action[]{
@ -55,6 +56,7 @@ public class StressAssignmentManagerMonkeyFactory extends MonkeyFactory {
new SplitAllRegionOfTableAction(tableName),
new DecreaseMaxHFileSizeAction(MonkeyConstants.DEFAULT_DECREASE_HFILE_SIZE_SLEEP_TIME,
tableName),
new MoveMetaAction()
};
// Action to log more info for debugging

View File

@ -1858,6 +1858,26 @@ public class HRegionServer extends HasThread implements
return wal;
}
/**
 * Releases the WAL held by a region that is being closed on this server.
 * <p>
 * Nothing happens unless {@code regionInfo} describes the default replica of the meta table.
 * For that region the meta WAL is closed through the {@code walFactory} and the WAL is then
 * deregistered from the meta WAL roller, if a roller has been created.
 *
 * @param regionInfo the region releasing its WAL; may be null, in which case this is a no-op
 * @param wal the WAL instance to deregister from the meta WAL roller
 * @throws IOException if closing the meta WAL fails
 */
@Override
public void releaseWAL(HRegionInfo regionInfo, WAL wal) throws IOException {
  // Single guard: the original repeated this exact condition in a nested if.
  if (regionInfo == null || !regionInfo.isMetaTable()
      || regionInfo.getReplicaId() != HRegionInfo.DEFAULT_REPLICA_ID) {
    return;
  }

  walFactory.closeMetaWAL(regionInfo.getEncodedNameAsBytes());

  LogRoller roller = metawalRoller.get();
  if (roller != null) {
    roller.removeWAL(wal); // only do this for meta WAL
  }
  // TODO: meta wal roller is left running. Should be fine.
}
@Override
public ClusterConnection getConnection() {
return this.clusterConnection;

View File

@ -82,6 +82,10 @@ public class LogRoller extends HasThread {
}
}
/**
* Stops tracking the given WAL by removing its entry from {@code walNeedsRoll}. Once removed,
* {@link #requestRollAll()} no longer flags this WAL, since it only iterates the remaining keys.
*
* @param wal the WAL to stop tracking
*/
public void removeWAL(final WAL wal) {
walNeedsRoll.remove(wal);
}
public void requestRollAll() {
for (WAL wal : walNeedsRoll.keySet()) {
walNeedsRoll.put(wal, Boolean.TRUE);

View File

@ -165,7 +165,6 @@ import org.apache.hadoop.hbase.regionserver.Region.Operation;
import org.apache.hadoop.hbase.regionserver.ScannerContext.LimitScope;
import org.apache.hadoop.hbase.regionserver.handler.OpenMetaHandler;
import org.apache.hadoop.hbase.regionserver.handler.OpenRegionHandler;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Bytes;
@ -175,6 +174,7 @@ import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
import org.apache.hadoop.hbase.util.Strings;
import org.apache.hadoop.hbase.wal.WAL;
import org.apache.hadoop.hbase.wal.WALKey;
import org.apache.hadoop.hbase.wal.WALSplitter;
import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
@ -1609,9 +1609,8 @@ public class RSRpcServices implements HBaseRPCErrorHandler,
return response;
}
HRegion.warmupHRegion(region, htd, regionServer.getWAL(region),
HRegion.warmupHRegion(region, htd, null,
regionServer.getConfiguration(), regionServer, null);
} catch (IOException ie) {
LOG.error("Failed warming up region " + region.getRegionNameAsString(), ie);
throw new ServiceException(ie);

View File

@ -54,6 +54,12 @@ public interface RegionServerServices extends OnlineRegions, FavoredNodesForRegi
* default (common) WAL */
WAL getWAL(HRegionInfo regionInfo) throws IOException;
/**
* Releases the dependency of this region to the WAL previously obtained from
* {@link #getWAL(HRegionInfo)}.
*
* @param regionInfo the region that no longer depends on the WAL
* @param wal the WAL that was previously obtained for {@code regionInfo}
* @throws IOException if the implementation fails while releasing the WAL
*/
void releaseWAL(HRegionInfo regionInfo, WAL wal) throws IOException;
/**
* @return Implementation of {@link CompactionRequestor} or null.
*/

View File

@ -19,6 +19,9 @@
package org.apache.hadoop.hbase.regionserver.handler;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import java.io.IOException;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.executor.EventType;
@ -40,4 +43,9 @@ public class CloseMetaHandler extends CloseRegionHandler {
super(server, rsServices, regionInfo, abort, closeRegionCoordination,
crd, EventType.M_RS_CLOSE_META);
}
/**
* Hands the WAL of the region being closed back to the region server services by calling
* {@code releaseWAL} with this handler's {@code regionInfo} and the region's WAL.
*
* @throws IOException propagated from {@code releaseWAL}
*/
@Override
protected void releaseWALIfNeeded() throws IOException {
// NOTE(review): dereferences the inherited 'region' field, so this must only run after the
// close path has assigned it a non-null value - confirm against the caller.
rsServices.releaseWAL(regionInfo, region.getWAL());
}
}

View File

@ -46,8 +46,9 @@ public class CloseRegionHandler extends EventHandler {
// have a running queue of user regions to close?
private static final Log LOG = LogFactory.getLog(CloseRegionHandler.class);
private final RegionServerServices rsServices;
private final HRegionInfo regionInfo;
protected final RegionServerServices rsServices;
protected final HRegionInfo regionInfo;
protected HRegion region;
// If true, the hosting server is aborting. Region close process is different
// when we are aborting.
@ -119,7 +120,7 @@ public class CloseRegionHandler extends EventHandler {
LOG.debug("Processing close of " + name);
String encodedRegionName = regionInfo.getEncodedName();
// Check that this region is being served here
HRegion region = (HRegion)rsServices.getFromOnlineRegions(encodedRegionName);
region = (HRegion)rsServices.getFromOnlineRegions(encodedRegionName);
if (region == null) {
LOG.warn("Received CLOSE for region " + name + " but currently not serving - ignoring");
// TODO: do better than a simple warning
@ -143,6 +144,10 @@ public class CloseRegionHandler extends EventHandler {
regionInfo.getRegionNameAsString());
return;
}
if (!abort) {
releaseWALIfNeeded();
}
} catch (IOException ioe) {
// An IOException here indicates that we couldn't successfully flush the
// memstore before closing. So, we need to abort the server and allow
@ -167,4 +172,8 @@ public class CloseRegionHandler extends EventHandler {
remove(this.regionInfo.getEncodedNameAsBytes());
}
}
/**
* Extension point that lets a subclass release the WAL of the region being closed. This base
* implementation does nothing. The handler calls it only when {@code abort} is false.
*
* @throws IOException declared so that overriding implementations may fail while releasing
*/
protected void releaseWALIfNeeded() throws IOException {
// release the WAL if needed. Only meta does this for now.
}
}

View File

@ -251,6 +251,21 @@ public class WALFactory {
return metaProvider.getWAL(identifier, null);
}
/**
 * Closes the meta WALProvider, if one is currently set, and clears the reference to it.
 *
 * @param identifier not consulted by this method
 * @throws IOException if closing the provider fails
 */
public void closeMetaWAL(final byte[] identifier) throws IOException {
  // NOTE: this assumes single META region. The close of WAL does not do ref-counting for the
  // number of regions depending on the meta WAL
  final WALProvider current = this.metaProvider.get();

  // Close only when this call is the one that swapped the reference to null; a call that
  // loses the compareAndSet leaves the provider untouched.
  if (current != null && this.metaProvider.compareAndSet(current, null)) {
    current.close();
  }
}
public Reader createReader(final FileSystem fs, final Path path) throws IOException {
return createReader(fs, path, (CancelableProgressable)null);
}

View File

@ -102,7 +102,7 @@ public class MockRegionServerServices implements RegionServerServices {
public List<Region> getOnlineRegions(TableName tableName) throws IOException {
return null;
}
@Override
public Set<TableName> getOnlineTables() {
return null;
@ -181,7 +181,7 @@ public class MockRegionServerServices implements RegionServerServices {
public TableLockManager getTableLockManager() {
return new NullTableLockManager();
}
@Override
public RegionServerQuotaManager getRegionServerQuotaManager() {
return null;
@ -304,4 +304,8 @@ public class MockRegionServerServices implements RegionServerServices {
public double getCompactionPressure() {
return 0;
}
@Override
public void releaseWAL(HRegionInfo regionInfo, WAL wal) throws IOException {
// No-op in this mock implementation.
}
}

View File

@ -66,8 +66,6 @@ import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.MergeRegionsReques
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.MergeRegionsResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.OpenRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.WarmupRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.WarmupRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ReplicateWALEntryRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.ReplicateWALEntryResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.RollWALWriterRequest;
@ -80,6 +78,8 @@ import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.UpdateConfiguratio
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.UpdateConfigurationResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.UpdateFavoredNodesRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.UpdateFavoredNodesResponse;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.WarmupRegionRequest;
import org.apache.hadoop.hbase.protobuf.generated.AdminProtos.WarmupRegionResponse;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.BulkLoadHFileRequest;
import org.apache.hadoop.hbase.protobuf.generated.ClientProtos.BulkLoadHFileResponse;
@ -541,7 +541,7 @@ ClientProtos.ClientService.BlockingInterface, RegionServerServices {
// TODO Auto-generated method stub
return null;
}
@Override
public Set<TableName> getOnlineTables() {
return null;
@ -649,4 +649,8 @@ ClientProtos.ClientService.BlockingInterface, RegionServerServices {
public double getCompactionPressure() {
return 0;
}
@Override
public void releaseWAL(HRegionInfo regionInfo, WAL wal) throws IOException {
// No-op in this mock implementation.
}
}

View File

@ -0,0 +1,166 @@
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.wal;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.MiniHBaseCluster;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.Waiter.Predicate;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;
import org.junit.experimental.categories.Category;
/**
 * Tests the case where a meta region is opened in one regionserver and then closed: no WAL files
 * for the meta region should be left over in that regionserver's WAL directory.
 */
@Category({MediumTests.class})
public class TestMetaWALsAreClosed {
  protected static final Log LOG = LogFactory.getLog(TestMetaWALsAreClosed.class);

  protected static final int NUM_RS = 2;

  protected static final HBaseTestingUtility TEST_UTIL =
      new HBaseTestingUtility();

  protected final Configuration conf = TEST_UTIL.getConfiguration();

  /** Starts a mini cluster with {@link #NUM_RS} region servers before each test. */
  @Before
  public void setUp() throws Exception {
    TEST_UTIL.startMiniCluster(1, NUM_RS);
  }

  /** Shuts the mini cluster down after each test. */
  @After
  public void tearDown() throws Exception {
    TEST_UTIL.shutdownMiniCluster();
  }

  /**
   * Builds the path of the WAL directory of the given server under the HBase root directory.
   *
   * @param serverName the server whose WAL directory is wanted
   * @return the WAL directory path for {@code serverName}
   * @throws IOException if the root directory cannot be resolved
   */
  private Path getWALDir(ServerName serverName) throws IOException {
    return new Path(FSUtils.getRootDir(TEST_UTIL.getConfiguration()),
        DefaultWALProvider.getWALDirectoryName(serverName.toString()));
  }

  /**
   * Tells whether the given WAL directory contains at least one meta WAL file, which this test
   * uses as the indicator that the owning server hosts (or hosted) the meta region.
   *
   * @param fs the file system to list
   * @param wals the WAL directory of a single region server
   * @return true if any entry in {@code wals} is a meta WAL file
   * @throws IOException if the directory cannot be listed
   */
  private boolean isHostingMeta(FileSystem fs, Path wals) throws IOException {
    for (FileStatus status : fs.listStatus(wals)) {
      LOG.info(status.getPath());
      if (DefaultWALProvider.isMetaFile(status.getPath())) {
        return true; // only 1 meta region for now
      }
    }
    return false;
  }

  /**
   * Asks the master to move the meta region to {@code target} and waits until the region
   * locator reports {@code target} as the meta location.
   *
   * @param target the server that should host meta afterwards
   * @throws Exception if the move request fails or the wait does not complete
   */
  private void moveMetaRegionAndWait(final ServerName target) throws Exception {
    try (final Connection conn = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration());
        final Admin admin = conn.getAdmin();
        final RegionLocator rl = conn.getRegionLocator(TableName.META_TABLE_NAME)) {

      // NOTE(review): presumably disabled so the balancer cannot relocate meta during the test.
      LOG.info("Disabling balancer");
      admin.setBalancerRunning(false, true);

      LOG.info("Moving meta region");
      admin.move(HRegionInfo.FIRST_META_REGIONINFO.getEncodedNameAsBytes(),
          Bytes.toBytes(target.toString()));

      LOG.info("Waiting for meta region to move");
      // wait for the move of meta region
      TEST_UTIL.waitFor(30000, new Predicate<Exception>() {
        @Override
        public boolean evaluate() throws Exception {
          // 'true' forces a reload of the location instead of using a cached one
          return target.equals(
              rl.getRegionLocation(HConstants.EMPTY_START_ROW, true).getServerName());
        }
      });
    }
  }

  /**
   * Moves meta away from its current server, verifies that the old server has no meta WAL files
   * left, moves meta back and finally runs a small write/read workload as a sanity check.
   */
  @Test (timeout = 60000)
  public void testMetaWALsAreClosed() throws Exception {
    MiniHBaseCluster cluster = TEST_UTIL.getMiniHBaseCluster();
    FileSystem fs = TEST_UTIL.getTestFileSystem();

    // find the region server hosting the meta table now.
    ServerName metaServerName = null;
    ServerName otherServerName = null;
    for (RegionServerThread rs : cluster.getRegionServerThreads()) {
      ServerName serverName = rs.getRegionServer().getServerName();
      if (isHostingMeta(fs, getWALDir(serverName))) {
        metaServerName = serverName; // only 1 meta region for now
      } else {
        otherServerName = serverName;
      }
    }

    LOG.info(metaServerName);
    LOG.info(otherServerName);
    assertNotNull("no region server with a meta WAL was found", metaServerName);
    assertNotNull("no region server without a meta WAL was found", otherServerName);

    moveMetaRegionAndWait(otherServerName);

    LOG.info("Checking that old meta server does not have WALs for meta");
    // the server that used to host meta now should not have any WAL files for the meta region now
    for (FileStatus status : fs.listStatus(getWALDir(metaServerName))) {
      LOG.info(status.getPath());
      assertFalse("meta WAL left behind after meta was moved away: " + status.getPath(),
          DefaultWALProvider.isMetaFile(status.getPath()));
    }

    // move the meta region back to the server that hosted it originally
    moveMetaRegionAndWait(metaServerName);

    // do some basic operations to ensure that nothing is failing
    HTableDescriptor htd = TEST_UTIL.createTableDescriptor("foo");
    TEST_UTIL.getHBaseAdmin().createTable(htd);
    try (Connection conn = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration());
        Table table = conn.getTable(htd.getTableName())) {

      TEST_UTIL.loadNumericRows(table, TEST_UTIL.fam1, 0, 100);
      TEST_UTIL.verifyNumericRows(table, TEST_UTIL.fam1, 0, 100, 0);
    }
  }
}