HBASE-21155 Save on a few log strings and some churn in wal splitter by skipping out early if no logs in dir

Author: Michael Stack, 2018-09-05 15:26:16 -07:00
commit 205783419c (parent 2051b0982d)
7 changed files with 54 additions and 32 deletions

ProcedureWALFormat.java

@@ -83,6 +83,7 @@ public final class ProcedureWALFormat {
     // Ignore the last log which is current active log.
     while (logs.hasNext()) {
       ProcedureWALFile log = logs.next();
+      LOG.debug("Loading WAL id={}", log.getLogId());
       log.open();
       try {
         reader.read(log);

WALProcedureStore.java

@@ -424,6 +424,8 @@ public class WALProcedureStore extends ProcedureStoreBase {
       it.next(); // Skip the current log
       ProcedureWALFormat.load(it, storeTracker, new ProcedureWALFormat.Loader() {
+        long count = 0;
+
         @Override
         public void setMaxProcId(long maxProcId) {
           loader.setMaxProcId(maxProcId);
@@ -431,6 +433,11 @@ public class WALProcedureStore extends ProcedureStoreBase {
         @Override
         public void load(ProcedureIterator procIter) throws IOException {
+          if ((++count % 1000) == 0) {
+            // Log every 1000 procedures otherwise it looks like Master is dead if loads of WALs
+            // and procedures to load.
+            LOG.debug("Loaded {} procedures", this.count);
+          }
           loader.load(procIter);
         }
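
The loader change above is plain count-based throttling: bump a counter per procedure and emit a progress line only every 1000th one, so a long recovery still shows signs of life without flooding the log. A minimal, self-contained sketch of the same pattern (class and method names here are illustrative, not the HBase API):

  import org.slf4j.Logger;
  import org.slf4j.LoggerFactory;

  final class ThrottledProgressLog {
    private static final Logger LOG = LoggerFactory.getLogger(ThrottledProgressLog.class);
    private static final int REPORT_EVERY = 1000; // mirror the 1000-procedure interval above

    static void loadAll(Iterable<String> items) {
      long count = 0;
      for (String item : items) {
        // per-item work would go here
        if ((++count % REPORT_EVERY) == 0) {
          LOG.debug("Loaded {} items", count); // periodic heartbeat, not one line per item
        }
      }
    }
  }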

HMaster.java

@@ -574,7 +574,7 @@ public class HMaster extends HRegionServer implements MasterServices {
             abort(error, t);
           }
         }
-      }));
+      }), getName() + ":becomeActiveMaster");
     }
     // Fall in here even if we have been aborted. Need to run the shutdown services and
     // the super run call will do this for us.
@@ -873,12 +873,15 @@ public class HMaster extends HRegionServer implements MasterServices {
           HBaseFsck.createLockRetryCounterFactory(this.conf).create());
     }
-    status.setStatus("Initialze ServerManager and schedule SCP for crash servers");
+    status.setStatus("Initialize ServerManager and schedule SCP for crash servers");
     this.serverManager = createServerManager(this);
     createProcedureExecutor();
     // Create Assignment Manager
     this.assignmentManager = new AssignmentManager(this);
     this.assignmentManager.start();
+    // Start RegionServerTracker with listing of servers found with exiting SCPs -- these should
+    // be registered in the deadServers set -- and with the list of servernames out on the
+    // filesystem that COULD BE 'alive' (we'll schedule SCPs for each and let SCP figure it out).
     this.regionServerTracker = new RegionServerTracker(zooKeeper, this, this.serverManager);
     this.regionServerTracker.start(
       procedureExecutor.getProcedures().stream().filter(p -> p instanceof ServerCrashProcedure)
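
For orientation, a hedged sketch of how the two arguments to regionServerTracker.start() described in the new comment can be assembled. The stream over procedures continues the truncated line above; the getServerName() accessor on ServerCrashProcedure and the walManager reference are assumptions about this revision, not quoted code:

  // Sketch only (inside HMaster initialization); assumes ServerCrashProcedure#getServerName()
  // and a MasterWalManager field named walManager.
  Set<ServerName> deadServersFromPE = procedureExecutor.getProcedures().stream()
      .filter(p -> p instanceof ServerCrashProcedure)
      .map(p -> ((ServerCrashProcedure) p).getServerName())
      .collect(java.util.stream.Collectors.toSet());
  this.regionServerTracker.start(deadServersFromPE,
      this.walManager.getLiveServersFromWALDir());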

MasterWalManager.java

@@ -143,6 +143,11 @@ public class MasterWalManager {
     return this.fsOk;
   }

+  /**
+   * @return Return listing of ServerNames found in the filesystem under the WAL directory
+   *   that COULD BE 'alive'; excludes those that have a '-splitting' suffix as these are already
+   *   being split -- they cannot be 'alive'.
+   */
   public Set<ServerName> getLiveServersFromWALDir() throws IOException {
     Path walDirPath = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME);
     FileStatus[] walDirForLiveServers = FSUtils.listStatus(fs, walDirPath,
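
A minimal sketch of what the new javadoc describes: list the WAL directory under the root dir, drop entries carrying the '-splitting' suffix, and map the remaining directory names back to ServerNames. The suffix literal, the lambda PathFilter, and parseServerName are illustrative choices; the actual method body (cut off above) may use shared constants and helpers instead:

  // Hedged sketch, not the verbatim method body.
  public Set<ServerName> getLiveServersFromWALDir() throws IOException {
    Path walDirPath = new Path(rootDir, HConstants.HREGION_LOGDIR_NAME);
    // Directories still being split cannot belong to a live server, so filter them out.
    FileStatus[] walDirForLiveServers = FSUtils.listStatus(fs, walDirPath,
        p -> !p.getName().endsWith("-splitting"));
    if (walDirForLiveServers == null) {
      return java.util.Collections.emptySet();
    }
    return java.util.stream.Stream.of(walDirForLiveServers)
        .map(s -> ServerName.parseServerName(s.getPath().getName()))
        .collect(java.util.stream.Collectors.toSet());
  }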

RegionServerTracker.java

@@ -111,13 +111,15 @@ public class RegionServerTracker extends ZKListener {
    * In this method, we will also construct the region server sets in {@link ServerManager}. If a
    * region server is dead between the crash of the previous master instance and the start of the
    * current master instance, we will schedule a SCP for it. This is done in
-   * {@link ServerManager#findOutDeadServersAndProcess(Set, Set)}, we call it here under the lock
+   * {@link ServerManager#findDeadServersAndProcess(Set, Set)}, we call it here under the lock
    * protection to prevent concurrency issues with server expiration operation.
    * @param deadServersFromPE the region servers which already have SCP associated.
    * @param liveServersFromWALDir the live region servers from wal directory.
    */
   public void start(Set<ServerName> deadServersFromPE, Set<ServerName> liveServersFromWALDir)
       throws KeeperException, IOException {
+    LOG.info("Starting RegionServerTracker; {} have existing ServerCrashProcedures, {} " +
+      "possibly 'live' servers.", deadServersFromPE.size(), liveServersFromWALDir.size());
     watcher.registerListener(this);
     synchronized (this) {
       List<String> servers =
@@ -132,7 +134,7 @@ public class RegionServerTracker extends ZKListener {
             info.getVersionInfo().getVersion()) : ServerMetricsBuilder.of(serverName);
         serverManager.checkAndRecordNewServer(serverName, serverMetrics);
       }
-      serverManager.findOutDeadServersAndProcess(deadServersFromPE, liveServersFromWALDir);
+      serverManager.findDeadServersAndProcess(deadServersFromPE, liveServersFromWALDir);
     }
   }
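
Condensed, the start() flow the javadoc is describing looks like this; the step bodies are elided and the comments only restate what the diff shows:

  public void start(Set<ServerName> deadServersFromPE, Set<ServerName> liveServersFromWALDir)
      throws KeeperException, IOException {
    LOG.info("Starting RegionServerTracker; ...");
    watcher.registerListener(this);          // begin watching region server znodes
    synchronized (this) {                    // the lock protection the javadoc refers to
      // read the current region server znodes and record each in ServerManager, then reconcile:
      serverManager.findDeadServersAndProcess(deadServersFromPE, liveServersFromWALDir);
    }
  }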

ServerManager.java

@@ -313,10 +313,10 @@ public class ServerManager {
    * <p/>
    * Must be called inside the initialization method of {@code RegionServerTracker} to avoid
    * concurrency issue.
-   * @param deadServersFromPE the region servers which already have SCP associated.
+   * @param deadServersFromPE the region servers which already have a SCP associated.
    * @param liveServersFromWALDir the live region servers from wal directory.
    */
-  void findOutDeadServersAndProcess(Set<ServerName> deadServersFromPE,
+  void findDeadServersAndProcess(Set<ServerName> deadServersFromPE,
       Set<ServerName> liveServersFromWALDir) {
     deadServersFromPE.forEach(deadservers::add);
     liveServersFromWALDir.stream().filter(sn -> !onlineServers.containsKey(sn))
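
The stream on the last line above is cut off by the diff; in spirit, the rest of the method expires every WAL-dir server that is not currently online so a ServerCrashProcedure gets scheduled for it. A hedged completion of that fragment (the expireServer call is an assumption about this revision, not quoted from the commit):

  // Sketch of the method body; expireServer(sn) queuing an SCP is assumed, not quoted.
  deadServersFromPE.forEach(deadservers::add);
  liveServersFromWALDir.stream()
      .filter(sn -> !onlineServers.containsKey(sn))
      .forEach(this::expireServer);
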
@@ -575,7 +575,7 @@ public class ServerManager {
   public void moveFromOnlineToDeadServers(final ServerName sn) {
     synchronized (onlineServers) {
       if (!this.onlineServers.containsKey(sn)) {
-        LOG.warn("Expiration of " + sn + " but server not online");
+        LOG.trace("Expiration of {} but server not online", sn);
       }
       // Remove the server from the known servers lists and update load info BUT
       // add to deadservers first; do this so it'll show in dead servers list if

SplitLogManager.java

@@ -174,7 +174,7 @@ public class SplitLogManager {
       }
       FileStatus[] logfiles = FSUtils.listStatus(fs, logDir, filter);
       if (logfiles == null || logfiles.length == 0) {
-        LOG.info(logDir + " is empty dir, no logs to split");
+        LOG.info(logDir + " dir is empty, no logs to split.");
       } else {
         Collections.addAll(fileStatus, logfiles);
       }
@@ -235,29 +235,33 @@ public class SplitLogManager {
       PathFilter filter) throws IOException {
     MonitoredTask status = TaskMonitor.get().createStatus("Doing distributed log split in " +
       logDirs + " for serverName=" + serverNames);
-    FileStatus[] logfiles = getFileList(logDirs, filter);
-    status.setStatus("Checking directory contents...");
-    SplitLogCounters.tot_mgr_log_split_batch_start.increment();
-    LOG.info("Started splitting " + logfiles.length + " logs in " + logDirs +
-        " for " + serverNames);
-    long t = EnvironmentEdgeManager.currentTime();
     long totalSize = 0;
-    TaskBatch batch = new TaskBatch();
-    for (FileStatus lf : logfiles) {
-      // TODO If the log file is still being written to - which is most likely
-      // the case for the last log file - then its length will show up here
-      // as zero. The size of such a file can only be retrieved after
-      // recover-lease is done. totalSize will be under in most cases and the
-      // metrics that it drives will also be under-reported.
-      totalSize += lf.getLen();
-      String pathToLog = FSUtils.removeWALRootPath(lf.getPath(), conf);
-      if (!enqueueSplitTask(pathToLog, batch)) {
-        throw new IOException("duplicate log split scheduled for " + lf.getPath());
+    TaskBatch batch = null;
+    long startTime = 0;
+    FileStatus[] logfiles = getFileList(logDirs, filter);
+    if (logfiles.length != 0) {
+      status.setStatus("Checking directory contents...");
+      SplitLogCounters.tot_mgr_log_split_batch_start.increment();
+      LOG.info("Started splitting " + logfiles.length + " logs in " + logDirs +
+          " for " + serverNames);
+      startTime = EnvironmentEdgeManager.currentTime();
+      batch = new TaskBatch();
+      for (FileStatus lf : logfiles) {
+        // TODO If the log file is still being written to - which is most likely
+        // the case for the last log file - then its length will show up here
+        // as zero. The size of such a file can only be retrieved after
+        // recover-lease is done. totalSize will be under in most cases and the
+        // metrics that it drives will also be under-reported.
+        totalSize += lf.getLen();
+        String pathToLog = FSUtils.removeWALRootPath(lf.getPath(), conf);
+        if (!enqueueSplitTask(pathToLog, batch)) {
+          throw new IOException("duplicate log split scheduled for " + lf.getPath());
+        }
       }
+      waitForSplittingCompletion(batch, status);
     }
-    waitForSplittingCompletion(batch, status);
-    if (batch.done != batch.installed) {
+    if (batch != null && batch.done != batch.installed) {
       batch.isDead = true;
       SplitLogCounters.tot_mgr_log_split_batch_err.increment();
       LOG.warn("error while splitting logs in " + logDirs + " installed = " + batch.installed
@@ -285,10 +289,10 @@ public class SplitLogManager {
       }
       SplitLogCounters.tot_mgr_log_split_batch_success.increment();
     }
-    String msg =
-        "finished splitting (more than or equal to) " + totalSize + " bytes in " + batch.installed
-        + " log files in " + logDirs + " in "
-        + (EnvironmentEdgeManager.currentTime() - t) + "ms";
+    String msg = "Finished splitting (more than or equal to) " + totalSize +
+        " bytes in " + ((batch == null)? 0: batch.installed) +
+        " log files in " + logDirs + " in " +
+        ((startTime == -1)? startTime: (EnvironmentEdgeManager.currentTime() - startTime)) + "ms";
     status.markComplete(msg);
     LOG.info(msg);
     return totalSize;
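
The restructuring above is the "skipping out early" from the commit title: the counter bump, the "Started splitting" log line, and the TaskBatch are only created when the directory actually holds logs, and the summary string tolerates a null batch. Reduced to its essentials (illustrative condensation, not the exact code):

  long totalSize = 0;
  TaskBatch batch = null;                       // stays null when there is nothing to split
  FileStatus[] logfiles = getFileList(logDirs, filter);
  if (logfiles.length != 0) {                   // only log, count and enqueue when there is work
    batch = new TaskBatch();
    for (FileStatus lf : logfiles) {
      totalSize += lf.getLen();
      enqueueSplitTask(FSUtils.removeWALRootPath(lf.getPath(), conf), batch);
    }
    waitForSplittingCompletion(batch, status);
  }
  long installed = (batch == null) ? 0 : batch.installed;  // summary must handle the empty case
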
@@ -448,7 +452,7 @@ public class SplitLogManager {
       }
       deadWorkers.add(workerName);
     }
-    LOG.info("dead splitlog worker " + workerName);
+    LOG.info("Dead splitlog worker {}", workerName);
   }

   void handleDeadWorkers(Set<ServerName> serverNames) {