From a8a2c516a0752ff782f8338502c8b2005049eda3 Mon Sep 17 00:00:00 2001 From: Matteo Bertozzi Date: Wed, 4 May 2016 07:59:44 -0700 Subject: [PATCH] HBASE-15763 Isolate Wal related stuff from MasterFileSystem --- .../apache/hadoop/hbase/master/HMaster.java | 26 +- .../hadoop/hbase/master/MasterFileSystem.java | 321 +--------------- .../hadoop/hbase/master/MasterServices.java | 5 + .../hadoop/hbase/master/MasterWalManager.java | 351 ++++++++++++++++++ .../hadoop/hbase/master/ServerManager.java | 4 +- .../hadoop/hbase/master/SplitLogManager.java | 8 +- .../procedure/ServerCrashProcedure.java | 22 +- .../hbase/master/MockNoopMasterServices.java | 5 + .../hbase/master/TestCatalogJanitor.java | 11 +- .../master/TestDistributedLogSplitting.java | 17 +- .../hbase/master/TestMasterFileSystem.java | 56 +-- .../hbase/master/TestMasterWalManager.java | 104 ++++++ 12 files changed, 533 insertions(+), 397 deletions(-) create mode 100644 hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java create mode 100644 hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWalManager.java diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index 2f1cd3c929e..d7f7c189ca8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -270,6 +270,7 @@ public class HMaster extends HRegionServer implements MasterServices { final MetricsMaster metricsMaster; // file system manager for the master FS operations private MasterFileSystem fileSystemManager; + private MasterWalManager walManager; // server manager to deal with region server info volatile ServerManager serverManager; @@ -656,7 +657,8 @@ public class HMaster extends HRegionServer implements MasterServices { this.masterActiveTime = System.currentTimeMillis(); // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring. - this.fileSystemManager = new MasterFileSystem(this, this); + this.fileSystemManager = new MasterFileSystem(this); + this.walManager = new MasterWalManager(this); // enable table descriptors cache this.tableDescriptors.setCacheOn(); @@ -715,7 +717,7 @@ public class HMaster extends HRegionServer implements MasterServices { // we recover hbase:meta region servers inside master initialization and // handle other failed servers in SSH in order to start up master node ASAP Set previouslyFailedServers = - this.fileSystemManager.getFailedServersFromLogFolders(); + this.walManager.getFailedServersFromLogFolders(); // log splitting for hbase:meta server ServerName oldMetaServerLocation = metaTableLocator.getMetaRegionLocation(this.getZooKeeper()); @@ -946,11 +948,11 @@ public class HMaster extends HRegionServer implements MasterServices { // TODO: should we prevent from using state manager before meta was initialized? // tableStateManager.start(); - if ((RecoveryMode.LOG_REPLAY == this.getMasterFileSystem().getLogRecoveryMode()) + if ((RecoveryMode.LOG_REPLAY == this.getMasterWalManager().getLogRecoveryMode()) && (!previouslyFailedMetaRSs.isEmpty())) { // replay WAL edits mode need new hbase:meta RS is assigned firstly status.setStatus("replaying log for Meta Region"); - this.fileSystemManager.splitMetaLog(previouslyFailedMetaRSs); + this.walManager.splitMetaLog(previouslyFailedMetaRSs); } this.assignmentManager.setEnabledTable(TableName.META_TABLE_NAME); @@ -985,14 +987,14 @@ public class HMaster extends HRegionServer implements MasterServices { } private void splitMetaLogBeforeAssignment(ServerName currentMetaServer) throws IOException { - if (RecoveryMode.LOG_REPLAY == this.getMasterFileSystem().getLogRecoveryMode()) { + if (RecoveryMode.LOG_REPLAY == this.getMasterWalManager().getLogRecoveryMode()) { // In log replay mode, we mark hbase:meta region as recovering in ZK Set regions = new HashSet(); regions.add(HRegionInfo.FIRST_META_REGIONINFO); - this.fileSystemManager.prepareLogReplay(currentMetaServer, regions); + this.walManager.prepareLogReplay(currentMetaServer, regions); } else { // In recovered.edits mode: create recovered edits file for hbase:meta server - this.fileSystemManager.splitMetaLog(currentMetaServer); + this.walManager.splitMetaLog(currentMetaServer); } } @@ -1045,6 +1047,11 @@ public class HMaster extends HRegionServer implements MasterServices { return this.fileSystemManager; } + @Override + public MasterWalManager getMasterWalManager() { + return this.walManager; + } + @Override public TableStateManager getTableStateManager() { return tableStateManager; @@ -1082,8 +1089,8 @@ public class HMaster extends HRegionServer implements MasterServices { int cleanerInterval = conf.getInt("hbase.master.cleaner.interval", 60 * 1000); this.logCleaner = new LogCleaner(cleanerInterval, - this, conf, getMasterFileSystem().getFileSystem(), - getMasterFileSystem().getOldLogDir()); + this, conf, getMasterWalManager().getFileSystem(), + getMasterWalManager().getOldLogDir()); getChoreService().scheduleChore(logCleaner); //start the hfile archive cleaner thread @@ -1132,6 +1139,7 @@ public class HMaster extends HRegionServer implements MasterServices { if (this.activeMasterManager != null) this.activeMasterManager.stop(); if (this.serverManager != null) this.serverManager.stop(); if (this.assignmentManager != null) this.assignmentManager.stop(); + if (this.walManager != null) this.walManager.stop(); if (this.fileSystemManager != null) this.fileSystemManager.stop(); if (this.mpmHost != null) this.mpmHost.stop("server shutting down."); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java index 43ae2f88a36..ad6e09d0668 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterFileSystem.java @@ -19,99 +19,56 @@ package org.apache.hadoop.hbase.master; import java.io.IOException; -import java.io.InterruptedIOException; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.concurrent.locks.Lock; -import java.util.concurrent.locks.ReentrantLock; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileStatus; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; -import org.apache.hadoop.fs.PathFilter; import org.apache.hadoop.fs.permission.FsPermission; import org.apache.hadoop.hbase.ClusterId; import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; -import org.apache.hadoop.hbase.Server; -import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.TableDescriptor; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.backup.HFileArchiver; +import org.apache.hadoop.hbase.classification.InterfaceAudience; import org.apache.hadoop.hbase.exceptions.DeserializationException; import org.apache.hadoop.hbase.fs.HFileSystem; import org.apache.hadoop.hbase.mob.MobConstants; import org.apache.hadoop.hbase.mob.MobUtils; -import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode; import org.apache.hadoop.hbase.regionserver.HRegion; -import org.apache.hadoop.hbase.wal.DefaultWALProvider; -import org.apache.hadoop.hbase.wal.WALSplitter; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.FSTableDescriptors; import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.ipc.RemoteException; -import com.google.common.annotations.VisibleForTesting; - /** * This class abstracts a bunch of operations the HMaster needs to interact with - * the underlying file system, including splitting log files, checking file + * the underlying file system like creating the initial layout, checking file * system status, etc. */ @InterfaceAudience.Private public class MasterFileSystem { - private static final Log LOG = LogFactory.getLog(MasterFileSystem.class.getName()); + private static final Log LOG = LogFactory.getLog(MasterFileSystem.class); + // HBase configuration - Configuration conf; - // master status - Server master; - // metrics for master - private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem(); + private final Configuration conf; // Persisted unique cluster ID private ClusterId clusterId; // Keep around for convenience. private final FileSystem fs; - // Is the fileystem ok? - private volatile boolean fsOk = true; - // The Path to the old logs dir - private final Path oldLogDir; // root hbase directory on the FS private final Path rootdir; // hbase temp directory used for table construction and deletion private final Path tempdir; - // create the split log lock - final Lock splitLogLock = new ReentrantLock(); - final boolean distributedLogReplay; - final SplitLogManager splitLogManager; + private final MasterServices services; - final static PathFilter META_FILTER = new PathFilter() { - @Override - public boolean accept(Path p) { - return DefaultWALProvider.isMetaFile(p); - } - }; - - final static PathFilter NON_META_FILTER = new PathFilter() { - @Override - public boolean accept(Path p) { - return !DefaultWALProvider.isMetaFile(p); - } - }; - - public MasterFileSystem(Server master, MasterServices services) - throws IOException { - this.conf = master.getConfiguration(); - this.master = master; + public MasterFileSystem(MasterServices services) throws IOException { + this.conf = services.getConfiguration(); this.services = services; // Set filesystem to be that of this.rootdir else we get complaints about // mismatched filesystems if hbase.rootdir is hdfs and fs.defaultFS is @@ -126,18 +83,8 @@ public class MasterFileSystem { // make sure the fs has the same conf fs.setConf(conf); // setup the filesystem variable - // set up the archived logs path - this.oldLogDir = createInitialFileSystemLayout(); + createInitialFileSystemLayout(); HFileSystem.addLocationsOrderInterceptor(conf); - this.splitLogManager = - new SplitLogManager(master, master.getConfiguration(), master, services, - master.getServerName()); - this.distributedLogReplay = this.splitLogManager.isLogReplaying(); - } - - @VisibleForTesting - SplitLogManager getSplitLogManager() { - return this.splitLogManager; } /** @@ -146,55 +93,23 @@ public class MasterFileSystem { *
  • Check if the meta region exists and is readable, if not create it. * Create hbase.version and the hbase:meta directory if not one. *
  • - *
  • Create a log archive directory for RS to put archived logs
  • * * Idempotent. */ - private Path createInitialFileSystemLayout() throws IOException { + private void createInitialFileSystemLayout() throws IOException { // check if the root directory exists checkRootDir(this.rootdir, conf, this.fs); // check if temp directory exists and clean it checkTempDir(this.tempdir, conf, this.fs); - - Path oldLogDir = new Path(this.rootdir, HConstants.HREGION_OLDLOGDIR_NAME); - - // Make sure the region servers can archive their old logs - if(!this.fs.exists(oldLogDir)) { - this.fs.mkdirs(oldLogDir); - } - - return oldLogDir; } public FileSystem getFileSystem() { return this.fs; } - /** - * Get the directory where old logs go - * @return the dir - */ - public Path getOldLogDir() { - return this.oldLogDir; - } - - /** - * Checks to see if the file system is still accessible. - * If not, sets closed - * @return false if file system is not available - */ - public boolean checkFileSystem() { - if (this.fsOk) { - try { - FSUtils.checkFileSystemAvailable(this.fs); - FSUtils.checkDfsSafeMode(this.conf); - } catch (IOException e) { - master.abort("Shutting down HBase cluster: file system not available", e); - this.fsOk = false; - } - } - return this.fsOk; + public Configuration getConfiguration() { + return this.conf; } /** @@ -218,197 +133,6 @@ public class MasterFileSystem { return clusterId; } - /** - * Inspect the log directory to find dead servers which need recovery work - * @return A set of ServerNames which aren't running but still have WAL files left in file system - */ - Set getFailedServersFromLogFolders() { - boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors", - WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT); - - Set serverNames = new HashSet(); - Path logsDirPath = new Path(this.rootdir, HConstants.HREGION_LOGDIR_NAME); - - do { - if (master.isStopped()) { - LOG.warn("Master stopped while trying to get failed servers."); - break; - } - try { - if (!this.fs.exists(logsDirPath)) return serverNames; - FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null); - // Get online servers after getting log folders to avoid log folder deletion of newly - // checked in region servers . see HBASE-5916 - Set onlineServers = ((HMaster) master).getServerManager().getOnlineServers() - .keySet(); - - if (logFolders == null || logFolders.length == 0) { - LOG.debug("No log files to split, proceeding..."); - return serverNames; - } - for (FileStatus status : logFolders) { - FileStatus[] curLogFiles = FSUtils.listStatus(this.fs, status.getPath(), null); - if (curLogFiles == null || curLogFiles.length == 0) { - // Empty log folder. No recovery needed - continue; - } - final ServerName serverName = DefaultWALProvider.getServerNameFromWALDirectoryName( - status.getPath()); - if (null == serverName) { - LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " + - "region server name; leaving in place. If you see later errors about missing " + - "write ahead logs they may be saved in this location."); - } else if (!onlineServers.contains(serverName)) { - LOG.info("Log folder " + status.getPath() + " doesn't belong " - + "to a known region server, splitting"); - serverNames.add(serverName); - } else { - LOG.info("Log folder " + status.getPath() + " belongs to an existing region server"); - } - } - retrySplitting = false; - } catch (IOException ioe) { - LOG.warn("Failed getting failed servers to be recovered.", ioe); - if (!checkFileSystem()) { - LOG.warn("Bad Filesystem, exiting"); - Runtime.getRuntime().halt(1); - } - try { - if (retrySplitting) { - Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000)); - } - } catch (InterruptedException e) { - LOG.warn("Interrupted, aborting since cannot return w/o splitting"); - Thread.currentThread().interrupt(); - retrySplitting = false; - Runtime.getRuntime().halt(1); - } - } - } while (retrySplitting); - - return serverNames; - } - - public void splitLog(final ServerName serverName) throws IOException { - Set serverNames = new HashSet(); - serverNames.add(serverName); - splitLog(serverNames); - } - - /** - * Specialized method to handle the splitting for meta WAL - * @param serverName - * @throws IOException - */ - public void splitMetaLog(final ServerName serverName) throws IOException { - Set serverNames = new HashSet(); - serverNames.add(serverName); - splitMetaLog(serverNames); - } - - /** - * Specialized method to handle the splitting for meta WAL - * @param serverNames - * @throws IOException - */ - public void splitMetaLog(final Set serverNames) throws IOException { - splitLog(serverNames, META_FILTER); - } - - @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification= - "We only release this lock when we set it. Updates to code that uses it should verify use " + - "of the guard boolean.") - private List getLogDirs(final Set serverNames) throws IOException { - List logDirs = new ArrayList(); - boolean needReleaseLock = false; - if (!this.services.isInitialized()) { - // during master initialization, we could have multiple places splitting a same wal - this.splitLogLock.lock(); - needReleaseLock = true; - } - try { - for (ServerName serverName : serverNames) { - Path logDir = new Path(this.rootdir, - DefaultWALProvider.getWALDirectoryName(serverName.toString())); - Path splitDir = logDir.suffix(DefaultWALProvider.SPLITTING_EXT); - // Rename the directory so a rogue RS doesn't create more WALs - if (fs.exists(logDir)) { - if (!this.fs.rename(logDir, splitDir)) { - throw new IOException("Failed fs.rename for log split: " + logDir); - } - logDir = splitDir; - LOG.debug("Renamed region directory: " + splitDir); - } else if (!fs.exists(splitDir)) { - LOG.info("Log dir for server " + serverName + " does not exist"); - continue; - } - logDirs.add(splitDir); - } - } finally { - if (needReleaseLock) { - this.splitLogLock.unlock(); - } - } - return logDirs; - } - - /** - * Mark regions in recovering state when distributedLogReplay are set true - * @param serverName Failed region server whose wals to be replayed - * @param regions Set of regions to be recovered - * @throws IOException - */ - public void prepareLogReplay(ServerName serverName, Set regions) throws IOException { - if (!this.distributedLogReplay) { - return; - } - // mark regions in recovering state - if (regions == null || regions.isEmpty()) { - return; - } - this.splitLogManager.markRegionsRecovering(serverName, regions); - } - - public void splitLog(final Set serverNames) throws IOException { - splitLog(serverNames, NON_META_FILTER); - } - - /** - * Wrapper function on {@link SplitLogManager#removeStaleRecoveringRegions(Set)} - * @param failedServers - * @throws IOException - */ - void removeStaleRecoveringRegionsFromZK(final Set failedServers) - throws IOException, InterruptedIOException { - this.splitLogManager.removeStaleRecoveringRegions(failedServers); - } - - /** - * This method is the base split method that splits WAL files matching a filter. Callers should - * pass the appropriate filter for meta and non-meta WALs. - * @param serverNames logs belonging to these servers will be split; this will rename the log - * directory out from under a soft-failed server - * @param filter - * @throws IOException - */ - public void splitLog(final Set serverNames, PathFilter filter) throws IOException { - long splitTime = 0, splitLogSize = 0; - List logDirs = getLogDirs(serverNames); - - splitLogManager.handleDeadWorkers(serverNames); - splitTime = EnvironmentEdgeManager.currentTime(); - splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter); - splitTime = EnvironmentEdgeManager.currentTime() - splitTime; - - if (this.metricsMasterFilesystem != null) { - if (filter == META_FILTER) { - this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize); - } else { - this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize); - } - } - } - /** * Get the rootdir. Make sure its wholesome and exists before returning. * @param rd @@ -418,9 +142,8 @@ public class MasterFileSystem { * needed populating the directory with necessary bootup files). * @throws IOException */ - private Path checkRootDir(final Path rd, final Configuration c, - final FileSystem fs) - throws IOException { + private Path checkRootDir(final Path rd, final Configuration c, final FileSystem fs) + throws IOException { // If FS is in safe mode wait till out of it. FSUtils.waitOnSafeMode(c, c.getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000)); @@ -595,22 +318,6 @@ public class MasterFileSystem { } public void stop() { - if (splitLogManager != null) { - this.splitLogManager.stop(); - } - } - - /** - * The function is used in SSH to set recovery mode based on configuration after all outstanding - * log split tasks drained. - * @throws IOException - */ - public void setLogRecoveryMode() throws IOException { - this.splitLogManager.setRecoveryMode(false); - } - - public RecoveryMode getLogRecoveryMode() { - return this.splitLogManager.getRecoveryMode(); } public void logFileSystemState(Log log) throws IOException { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java index d095183ebf0..21f14e81ff8 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterServices.java @@ -72,6 +72,11 @@ public interface MasterServices extends Server { */ MasterFileSystem getMasterFileSystem(); + /** + * @return Master's WALs {@link MasterWalManager} utility class. + */ + MasterWalManager getMasterWalManager(); + /** * @return Master's {@link ServerManager} instance. */ diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java new file mode 100644 index 00000000000..d447e2d6249 --- /dev/null +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterWalManager.java @@ -0,0 +1,351 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master; + +import com.google.common.annotations.VisibleForTesting; + +import java.io.IOException; +import java.io.InterruptedIOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.locks.Lock; +import java.util.concurrent.locks.ReentrantLock; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.fs.PathFilter; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.HRegionInfo; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.classification.InterfaceAudience; +import org.apache.hadoop.hbase.protobuf.generated.ZooKeeperProtos.SplitLogTask.RecoveryMode; +import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.wal.DefaultWALProvider; +import org.apache.hadoop.hbase.wal.WALSplitter; + +/** + * This class abstracts a bunch of operations the HMaster needs + * when splitting log files e.g. finding log files, dirs etc. + */ +@InterfaceAudience.Private +public class MasterWalManager { + private static final Log LOG = LogFactory.getLog(MasterWalManager.class); + + final static PathFilter META_FILTER = new PathFilter() { + @Override + public boolean accept(Path p) { + return DefaultWALProvider.isMetaFile(p); + } + }; + + final static PathFilter NON_META_FILTER = new PathFilter() { + @Override + public boolean accept(Path p) { + return !DefaultWALProvider.isMetaFile(p); + } + }; + + // metrics for master + // TODO: Rename it, since those metrics are split-manager related + private final MetricsMasterFileSystem metricsMasterFilesystem = new MetricsMasterFileSystem(); + + // Keep around for convenience. + private final MasterServices services; + private final Configuration conf; + private final FileSystem fs; + + // The Path to the old logs dir + private final Path oldLogDir; + private final Path rootDir; + + // create the split log lock + private final Lock splitLogLock = new ReentrantLock(); + private final SplitLogManager splitLogManager; + private final boolean distributedLogReplay; + + // Is the fileystem ok? + private volatile boolean fsOk = true; + + public MasterWalManager(MasterServices services) throws IOException { + this(services.getConfiguration(), services.getMasterFileSystem().getFileSystem(), + services.getMasterFileSystem().getRootDir(), services); + } + + public MasterWalManager(Configuration conf, FileSystem fs, Path rootDir, MasterServices services) + throws IOException { + this.fs = fs; + this.conf = conf; + this.rootDir = rootDir; + this.services = services; + this.splitLogManager = new SplitLogManager(services, conf, + services, services, services.getServerName()); + this.distributedLogReplay = this.splitLogManager.isLogReplaying(); + + this.oldLogDir = new Path(rootDir, HConstants.HREGION_OLDLOGDIR_NAME); + + // Make sure the region servers can archive their old logs + if (!this.fs.exists(oldLogDir)) { + this.fs.mkdirs(oldLogDir); + } + } + + public void stop() { + if (splitLogManager != null) { + splitLogManager.stop(); + } + } + + @VisibleForTesting + SplitLogManager getSplitLogManager() { + return this.splitLogManager; + } + + /** + * Get the directory where old logs go + * @return the dir + */ + Path getOldLogDir() { + return this.oldLogDir; + } + + public FileSystem getFileSystem() { + return this.fs; + } + + /** + * Checks to see if the file system is still accessible. + * If not, sets closed + * @return false if file system is not available + */ + private boolean checkFileSystem() { + if (this.fsOk) { + try { + FSUtils.checkFileSystemAvailable(this.fs); + FSUtils.checkDfsSafeMode(this.conf); + } catch (IOException e) { + services.abort("Shutting down HBase cluster: file system not available", e); + this.fsOk = false; + } + } + return this.fsOk; + } + + /** + * Inspect the log directory to find dead servers which need recovery work + * @return A set of ServerNames which aren't running but still have WAL files left in file system + */ + Set getFailedServersFromLogFolders() { + boolean retrySplitting = !conf.getBoolean("hbase.hlog.split.skip.errors", + WALSplitter.SPLIT_SKIP_ERRORS_DEFAULT); + + Set serverNames = new HashSet(); + Path logsDirPath = new Path(this.rootDir, HConstants.HREGION_LOGDIR_NAME); + + do { + if (services.isStopped()) { + LOG.warn("Master stopped while trying to get failed servers."); + break; + } + try { + if (!this.fs.exists(logsDirPath)) return serverNames; + FileStatus[] logFolders = FSUtils.listStatus(this.fs, logsDirPath, null); + // Get online servers after getting log folders to avoid log folder deletion of newly + // checked in region servers . see HBASE-5916 + Set onlineServers = services.getServerManager().getOnlineServers().keySet(); + + if (logFolders == null || logFolders.length == 0) { + LOG.debug("No log files to split, proceeding..."); + return serverNames; + } + for (FileStatus status : logFolders) { + FileStatus[] curLogFiles = FSUtils.listStatus(this.fs, status.getPath(), null); + if (curLogFiles == null || curLogFiles.length == 0) { + // Empty log folder. No recovery needed + continue; + } + final ServerName serverName = DefaultWALProvider.getServerNameFromWALDirectoryName( + status.getPath()); + if (null == serverName) { + LOG.warn("Log folder " + status.getPath() + " doesn't look like its name includes a " + + "region server name; leaving in place. If you see later errors about missing " + + "write ahead logs they may be saved in this location."); + } else if (!onlineServers.contains(serverName)) { + LOG.info("Log folder " + status.getPath() + " doesn't belong " + + "to a known region server, splitting"); + serverNames.add(serverName); + } else { + LOG.info("Log folder " + status.getPath() + " belongs to an existing region server"); + } + } + retrySplitting = false; + } catch (IOException ioe) { + LOG.warn("Failed getting failed servers to be recovered.", ioe); + if (!checkFileSystem()) { + LOG.warn("Bad Filesystem, exiting"); + Runtime.getRuntime().halt(1); + } + try { + if (retrySplitting) { + Thread.sleep(conf.getInt("hbase.hlog.split.failure.retry.interval", 30 * 1000)); + } + } catch (InterruptedException e) { + LOG.warn("Interrupted, aborting since cannot return w/o splitting"); + Thread.currentThread().interrupt(); + retrySplitting = false; + Runtime.getRuntime().halt(1); + } + } + } while (retrySplitting); + + return serverNames; + } + + public void splitLog(final ServerName serverName) throws IOException { + Set serverNames = new HashSet(); + serverNames.add(serverName); + splitLog(serverNames); + } + + /** + * Specialized method to handle the splitting for meta WAL + * @param serverName logs belonging to this server will be split + */ + public void splitMetaLog(final ServerName serverName) throws IOException { + Set serverNames = new HashSet(); + serverNames.add(serverName); + splitMetaLog(serverNames); + } + + /** + * Specialized method to handle the splitting for meta WAL + * @param serverNames logs belonging to these servers will be split + */ + public void splitMetaLog(final Set serverNames) throws IOException { + splitLog(serverNames, META_FILTER); + } + + @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="UL_UNRELEASED_LOCK", justification= + "We only release this lock when we set it. Updates to code that uses it should verify use " + + "of the guard boolean.") + private List getLogDirs(final Set serverNames) throws IOException { + List logDirs = new ArrayList(); + boolean needReleaseLock = false; + if (!this.services.isInitialized()) { + // during master initialization, we could have multiple places splitting a same wal + this.splitLogLock.lock(); + needReleaseLock = true; + } + try { + for (ServerName serverName : serverNames) { + Path logDir = new Path(this.rootDir, + DefaultWALProvider.getWALDirectoryName(serverName.toString())); + Path splitDir = logDir.suffix(DefaultWALProvider.SPLITTING_EXT); + // Rename the directory so a rogue RS doesn't create more WALs + if (fs.exists(logDir)) { + if (!this.fs.rename(logDir, splitDir)) { + throw new IOException("Failed fs.rename for log split: " + logDir); + } + logDir = splitDir; + LOG.debug("Renamed region directory: " + splitDir); + } else if (!fs.exists(splitDir)) { + LOG.info("Log dir for server " + serverName + " does not exist"); + continue; + } + logDirs.add(splitDir); + } + } finally { + if (needReleaseLock) { + this.splitLogLock.unlock(); + } + } + return logDirs; + } + + /** + * Mark regions in recovering state when distributedLogReplay are set true + * @param serverName Failed region server whose wals to be replayed + * @param regions Set of regions to be recovered + */ + public void prepareLogReplay(ServerName serverName, Set regions) throws IOException { + if (!this.distributedLogReplay) { + return; + } + // mark regions in recovering state + if (regions == null || regions.isEmpty()) { + return; + } + this.splitLogManager.markRegionsRecovering(serverName, regions); + } + + public void splitLog(final Set serverNames) throws IOException { + splitLog(serverNames, NON_META_FILTER); + } + + /** + * Wrapper function on {@link SplitLogManager#removeStaleRecoveringRegions(Set)} + * @param failedServers A set of known failed servers + */ + void removeStaleRecoveringRegionsFromZK(final Set failedServers) + throws IOException, InterruptedIOException { + this.splitLogManager.removeStaleRecoveringRegions(failedServers); + } + + /** + * This method is the base split method that splits WAL files matching a filter. Callers should + * pass the appropriate filter for meta and non-meta WALs. + * @param serverNames logs belonging to these servers will be split; this will rename the log + * directory out from under a soft-failed server + */ + public void splitLog(final Set serverNames, PathFilter filter) throws IOException { + long splitTime = 0, splitLogSize = 0; + List logDirs = getLogDirs(serverNames); + + splitLogManager.handleDeadWorkers(serverNames); + splitTime = EnvironmentEdgeManager.currentTime(); + splitLogSize = splitLogManager.splitLogDistributed(serverNames, logDirs, filter); + splitTime = EnvironmentEdgeManager.currentTime() - splitTime; + + if (this.metricsMasterFilesystem != null) { + if (filter == META_FILTER) { + this.metricsMasterFilesystem.addMetaWALSplit(splitTime, splitLogSize); + } else { + this.metricsMasterFilesystem.addSplit(splitTime, splitLogSize); + } + } + } + + /** + * The function is used in SSH to set recovery mode based on configuration after all outstanding + * log split tasks drained. + */ + public void setLogRecoveryMode() throws IOException { + this.splitLogManager.setRecoveryMode(false); + } + + public RecoveryMode getLogRecoveryMode() { + return this.splitLogManager.getRecoveryMode(); + } +} \ No newline at end of file diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java index d69c7aa9a15..37659f8c963 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -753,7 +753,7 @@ public class ServerManager { } OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, region, favoredNodes, - (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode())); + (RecoveryMode.LOG_REPLAY == this.services.getMasterWalManager().getLogRecoveryMode())); try { OpenRegionResponse response = admin.openRegion(null, request); return ResponseConverter.getRegionOpeningState(response); @@ -781,7 +781,7 @@ public class ServerManager { } OpenRegionRequest request = RequestConverter.buildOpenRegionRequest(server, regionOpenInfos, - (RecoveryMode.LOG_REPLAY == this.services.getMasterFileSystem().getLogRecoveryMode())); + (RecoveryMode.LOG_REPLAY == this.services.getMasterWalManager().getLogRecoveryMode())); try { OpenRegionResponse response = admin.openRegion(null, request); return ResponseConverter.getRegionOpeningStateList(response); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java index 4c2e745a31b..5d488dd0c5d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/SplitLogManager.java @@ -75,8 +75,8 @@ import com.google.common.annotations.VisibleForTesting; *

    SplitLogManager monitors the tasks that it creates using the * timeoutMonitor thread. If a task's progress is slow then * {@link SplitLogManagerCoordination#checkTasks} will take away the - * task from the owner {@link org.apache.hadoop.hbase.regionserver.SplitLogWorker} - * and the task will be up for grabs again. When the task is done then it is + * task from the owner {@link org.apache.hadoop.hbase.regionserver.SplitLogWorker} + * and the task will be up for grabs again. When the task is done then it is * deleted by SplitLogManager. * *

    Clients call {@link #splitLogDistributed(Path)} to split a region server's @@ -273,7 +273,7 @@ public class SplitLogManager { } waitForSplittingCompletion(batch, status); // remove recovering regions - if (filter == MasterFileSystem.META_FILTER /* reference comparison */) { + if (filter == MasterWalManager.META_FILTER /* reference comparison */) { // we split meta regions and user regions separately therefore logfiles are either all for // meta or user regions but won't for both( we could have mixed situations in tests) isMetaRecovery = true; @@ -411,7 +411,7 @@ public class SplitLogManager { for (ServerName tmpServerName : serverNames) { recoveredServerNameSet.add(tmpServerName.getServerName()); } - + this.recoveringRegionLock.lock(); try { ((BaseCoordinatedStateManager) server.getCoordinatedStateManager()) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java index 7de694cdea3..2fbb5598fe6 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/ServerCrashProcedure.java @@ -37,8 +37,8 @@ import org.apache.hadoop.hbase.client.ClusterConnection; import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.client.TableState; import org.apache.hadoop.hbase.master.AssignmentManager; -import org.apache.hadoop.hbase.master.MasterFileSystem; import org.apache.hadoop.hbase.master.MasterServices; +import org.apache.hadoop.hbase.master.MasterWalManager; import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.RegionStates; import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; @@ -320,10 +320,10 @@ implements ServerProcedureInterface { * @throws IOException */ private void start(final MasterProcedureEnv env) throws IOException { - MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem(); + MasterWalManager mwm = env.getMasterServices().getMasterWalManager(); // Set recovery mode late. This is what the old ServerShutdownHandler used do. - mfs.setLogRecoveryMode(); - this.distributedLogReplay = mfs.getLogRecoveryMode() == RecoveryMode.LOG_REPLAY; + mwm.setLogRecoveryMode(); + this.distributedLogReplay = mwm.getLogRecoveryMode() == RecoveryMode.LOG_REPLAY; } /** @@ -335,7 +335,7 @@ implements ServerProcedureInterface { private boolean processMeta(final MasterProcedureEnv env) throws IOException { if (LOG.isDebugEnabled()) LOG.debug("Processing hbase:meta that was on " + this.serverName); - MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem(); + MasterWalManager mwm = env.getMasterServices().getMasterWalManager(); AssignmentManager am = env.getMasterServices().getAssignmentManager(); HRegionInfo metaHRI = HRegionInfo.FIRST_META_REGIONINFO; if (this.shouldSplitWal) { @@ -343,7 +343,7 @@ implements ServerProcedureInterface { prepareLogReplay(env, META_REGION_SET); } else { // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment. - mfs.splitMetaLog(serverName); + mwm.splitMetaLog(serverName); am.getRegionStates().logSplit(metaHRI); } } @@ -360,7 +360,7 @@ implements ServerProcedureInterface { processed = false; } else { // TODO: Matteo. We BLOCK here but most important thing to be doing at this moment. - mfs.splitMetaLog(serverName); + mwm.splitMetaLog(serverName); } } } @@ -394,9 +394,9 @@ implements ServerProcedureInterface { LOG.debug("Mark " + size(this.regionsOnCrashedServer) + " regions-in-recovery from " + this.serverName); } - MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem(); + MasterWalManager mwm = env.getMasterServices().getMasterWalManager(); AssignmentManager am = env.getMasterServices().getAssignmentManager(); - mfs.prepareLogReplay(this.serverName, regions); + mwm.prepareLogReplay(this.serverName, regions); am.getRegionStates().logSplit(this.serverName); } @@ -405,10 +405,10 @@ implements ServerProcedureInterface { LOG.debug("Splitting logs from " + serverName + "; region count=" + size(this.regionsOnCrashedServer)); } - MasterFileSystem mfs = env.getMasterServices().getMasterFileSystem(); + MasterWalManager mwm = env.getMasterServices().getMasterWalManager(); AssignmentManager am = env.getMasterServices().getAssignmentManager(); // TODO: For Matteo. Below BLOCKs!!!! Redo so can relinquish executor while it is running. - mfs.splitLog(this.serverName); + mwm.splitLog(this.serverName); am.getRegionStates().logSplit(this.serverName); } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java index 0882716f7ab..60b62e4e311 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/MockNoopMasterServices.java @@ -97,6 +97,11 @@ public class MockNoopMasterServices implements MasterServices, Server { return null; } + @Override + public MasterWalManager getMasterWalManager() { + return null; + } + @Override public MasterCoprocessorHost getMasterCoprocessorHost() { return null; diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java index e850ed0f4a0..cc8e2d830e0 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestCatalogJanitor.java @@ -221,9 +221,11 @@ public class TestCatalogJanitor { class MockMasterServices extends MockNoopMasterServices { private final MasterFileSystem mfs; private final AssignmentManager asm; + private final Server server; MockMasterServices(final Server server) throws IOException { - this.mfs = new MasterFileSystem(server, this); + this.server = server; + this.mfs = new MasterFileSystem(this); this.asm = Mockito.mock(AssignmentManager.class); } @@ -239,7 +241,12 @@ public class TestCatalogJanitor { @Override public Configuration getConfiguration() { - return mfs.conf; + return server.getConfiguration(); + } + + @Override + public ServerName getServerName() { + return server.getServerName(); } @Override diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java index 4f2385fc671..170a882f924 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java @@ -1,6 +1,5 @@ /** * - * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -213,7 +212,7 @@ public class TestDistributedLogSplitting { startCluster(NUM_RS); final int NUM_LOG_LINES = 1000; - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; + final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager(); // turn off load balancing to prevent regions from moving around otherwise // they will consume recovered.edits master.balanceSwitch(false); @@ -680,7 +679,7 @@ public class TestDistributedLogSplitting { final ZooKeeperWatcher zkw = master.getZooKeeper(); Table ht = installTable(zkw, "table", "family", 40); try { - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; + final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager(); Set regionSet = new HashSet(); HRegionInfo region = null; @@ -929,7 +928,7 @@ public class TestDistributedLogSplitting { final ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "table-creation", null); Table ht = installTable(zkw, "table", "family", NUM_REGIONS_TO_CREATE); try { - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; + final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager(); Set regionSet = new HashSet(); HRegionInfo region = null; @@ -1004,7 +1003,7 @@ public class TestDistributedLogSplitting { LOG.info("testWorkerAbort"); startCluster(3); final int NUM_LOG_LINES = 10000; - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; + final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager(); FileSystem fs = master.getMasterFileSystem().getFileSystem(); final List rsts = cluster.getLiveRegionServerThreads(); @@ -1123,7 +1122,7 @@ public class TestDistributedLogSplitting { public void testDelayedDeleteOnFailure() throws Exception { LOG.info("testDelayedDeleteOnFailure"); startCluster(1); - final SplitLogManager slm = master.getMasterFileSystem().splitLogManager; + final SplitLogManager slm = master.getMasterWalManager().getSplitLogManager(); final FileSystem fs = master.getMasterFileSystem().getFileSystem(); final Path logDir = new Path(FSUtils.getRootDir(conf), "x"); fs.mkdirs(logDir); @@ -1204,10 +1203,10 @@ public class TestDistributedLogSplitting { LOG.info("#regions = " + regions.size()); Set tmpRegions = new HashSet(); tmpRegions.add(HRegionInfo.FIRST_META_REGIONINFO); - master.getMasterFileSystem().prepareLogReplay(hrs.getServerName(), tmpRegions); + master.getMasterWalManager().prepareLogReplay(hrs.getServerName(), tmpRegions); Set userRegionSet = new HashSet(); userRegionSet.addAll(regions); - master.getMasterFileSystem().prepareLogReplay(hrs.getServerName(), userRegionSet); + master.getMasterWalManager().prepareLogReplay(hrs.getServerName(), userRegionSet); boolean isMetaRegionInRecovery = false; List recoveringRegions = zkw.getRecoverableZooKeeper().getChildren(zkw.recoveringRegionsZNode, false); @@ -1219,7 +1218,7 @@ public class TestDistributedLogSplitting { } assertTrue(isMetaRegionInRecovery); - master.getMasterFileSystem().splitMetaLog(hrs.getServerName()); + master.getMasterWalManager().splitMetaLog(hrs.getServerName()); isMetaRegionInRecovery = false; recoveringRegions = diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFileSystem.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFileSystem.java index 053464329fd..bf13e7fc5dc 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFileSystem.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFileSystem.java @@ -21,25 +21,15 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertTrue; -import java.util.HashSet; -import java.util.Set; - import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.ServerName; -import org.apache.hadoop.hbase.SplitLogTask; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.FSUtils; -import org.apache.hadoop.hbase.zookeeper.ZKSplitLog; -import org.apache.hadoop.hbase.zookeeper.ZKUtil; -import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; -import org.apache.zookeeper.CreateMode; -import org.apache.zookeeper.ZooDefs.Ids; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -50,8 +40,8 @@ import org.junit.experimental.categories.Category; */ @Category({MasterTests.class, MediumTests.class}) public class TestMasterFileSystem { - private static final Log LOG = LogFactory.getLog(TestMasterFileSystem.class); + private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); @BeforeClass @@ -68,52 +58,12 @@ public class TestMasterFileSystem { public void testFsUriSetProperly() throws Exception { HMaster master = UTIL.getMiniHBaseCluster().getMaster(); MasterFileSystem fs = master.getMasterFileSystem(); - Path masterRoot = FSUtils.getRootDir(fs.conf); + Path masterRoot = FSUtils.getRootDir(fs.getConfiguration()); Path rootDir = FSUtils.getRootDir(fs.getFileSystem().getConf()); // make sure the fs and the found root dir have the same scheme LOG.debug("from fs uri:" + FileSystem.getDefaultUri(fs.getFileSystem().getConf())); - LOG.debug("from configuration uri:" + FileSystem.getDefaultUri(fs.conf)); + LOG.debug("from configuration uri:" + FileSystem.getDefaultUri(fs.getConfiguration())); // make sure the set uri matches by forcing it. assertEquals(masterRoot, rootDir); } - - @Test - public void testRemoveStaleRecoveringRegionsDuringMasterInitialization() throws Exception { - // this test is for when distributed log replay is enabled - if (!UTIL.getConfiguration().getBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, false)) return; - - LOG.info("Starting testRemoveStaleRecoveringRegionsDuringMasterInitialization"); - HMaster master = UTIL.getMiniHBaseCluster().getMaster(); - MasterFileSystem fs = master.getMasterFileSystem(); - - String failedRegion = "failedRegoin1"; - String staleRegion = "staleRegion"; - ServerName inRecoveryServerName = ServerName.valueOf("mgr,1,1"); - ServerName previouselyFaildServerName = ServerName.valueOf("previous,1,1"); - String walPath = "/hbase/data/.logs/" + inRecoveryServerName.getServerName() - + "-splitting/test"; - // Create a ZKW to use in the test - ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(UTIL); - zkw.getRecoverableZooKeeper().create(ZKSplitLog.getEncodedNodeName(zkw, walPath), - new SplitLogTask.Owned(inRecoveryServerName, fs.getLogRecoveryMode()).toByteArray(), - Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); - String staleRegionPath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, staleRegion); - ZKUtil.createWithParents(zkw, staleRegionPath); - String inRecoveringRegionPath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, failedRegion); - inRecoveringRegionPath = ZKUtil.joinZNode(inRecoveringRegionPath, - inRecoveryServerName.getServerName()); - ZKUtil.createWithParents(zkw, inRecoveringRegionPath); - Set servers = new HashSet(); - servers.add(previouselyFaildServerName); - fs.removeStaleRecoveringRegionsFromZK(servers); - - // verification - assertFalse(ZKUtil.checkExists(zkw, staleRegionPath) != -1); - assertTrue(ZKUtil.checkExists(zkw, inRecoveringRegionPath) != -1); - - ZKUtil.deleteChildrenRecursively(zkw, zkw.recoveringRegionsZNode); - ZKUtil.deleteChildrenRecursively(zkw, zkw.splitLogZNode); - zkw.close(); - } - } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWalManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWalManager.java new file mode 100644 index 00000000000..feb97d93252 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterWalManager.java @@ -0,0 +1,104 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +import java.util.HashSet; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.SplitLogTask; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.zookeeper.ZKSplitLog; +import org.apache.hadoop.hbase.zookeeper.ZKUtil; +import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher; +import org.apache.zookeeper.CreateMode; +import org.apache.zookeeper.ZooDefs.Ids; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +/** + * Test the master wal manager in a local cluster + */ +@Category({MasterTests.class, MediumTests.class}) +public class TestMasterWalManager { + private static final Log LOG = LogFactory.getLog(TestMasterWalManager.class); + private static final HBaseTestingUtility UTIL = new HBaseTestingUtility(); + + @BeforeClass + public static void setupTest() throws Exception { + UTIL.startMiniCluster(); + } + + @AfterClass + public static void teardownTest() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void testRemoveStaleRecoveringRegionsDuringMasterInitialization() throws Exception { + // this test is for when distributed log replay is enabled + if (!UTIL.getConfiguration().getBoolean(HConstants.DISTRIBUTED_LOG_REPLAY_KEY, false)) return; + + LOG.info("Starting testRemoveStaleRecoveringRegionsDuringMasterInitialization"); + HMaster master = UTIL.getMiniHBaseCluster().getMaster(); + MasterWalManager mwm = master.getMasterWalManager(); + + String failedRegion = "failedRegoin1"; + String staleRegion = "staleRegion"; + ServerName inRecoveryServerName = ServerName.valueOf("mgr,1,1"); + ServerName previouselyFaildServerName = ServerName.valueOf("previous,1,1"); + String walPath = "/hbase/data/.logs/" + inRecoveryServerName.getServerName() + + "-splitting/test"; + // Create a ZKW to use in the test + ZooKeeperWatcher zkw = HBaseTestingUtility.getZooKeeperWatcher(UTIL); + zkw.getRecoverableZooKeeper().create(ZKSplitLog.getEncodedNodeName(zkw, walPath), + new SplitLogTask.Owned(inRecoveryServerName, mwm.getLogRecoveryMode()).toByteArray(), + Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT); + String staleRegionPath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, staleRegion); + ZKUtil.createWithParents(zkw, staleRegionPath); + String inRecoveringRegionPath = ZKUtil.joinZNode(zkw.recoveringRegionsZNode, failedRegion); + inRecoveringRegionPath = ZKUtil.joinZNode(inRecoveringRegionPath, + inRecoveryServerName.getServerName()); + ZKUtil.createWithParents(zkw, inRecoveringRegionPath); + Set servers = new HashSet(); + servers.add(previouselyFaildServerName); + mwm.removeStaleRecoveringRegionsFromZK(servers); + + // verification + assertFalse(ZKUtil.checkExists(zkw, staleRegionPath) != -1); + assertTrue(ZKUtil.checkExists(zkw, inRecoveringRegionPath) != -1); + + ZKUtil.deleteChildrenRecursively(zkw, zkw.recoveringRegionsZNode); + ZKUtil.deleteChildrenRecursively(zkw, zkw.splitLogZNode); + zkw.close(); + } +}