HBASE-3836 Add facility to track currently progressing actions and workflows.

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1098933 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
Todd Lipcon 2011-05-03 06:11:06 +00:00
parent 8341e4a3be
commit ca952ac00b
16 changed files with 724 additions and 78 deletions

View File

@ -233,6 +233,8 @@ Release 0.91.0 - Unreleased
(Subbu M. Iyer via Stack) (Subbu M. Iyer via Stack)
HBASE-1364 [performance] Distributed splitting of regionserver commit logs HBASE-1364 [performance] Distributed splitting of regionserver commit logs
(Prakash Khemani) (Prakash Khemani)
HBASE-3836 Add facility to track currently progressing actions and
workflows. (todd)
Release 0.90.3 - Unreleased Release 0.90.3 - Unreleased

View File

@ -25,6 +25,7 @@ import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener; import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
@ -119,17 +120,20 @@ class ActiveMasterManager extends ZooKeeperListener {
* *
* This also makes sure that we are watching the master znode so will be * This also makes sure that we are watching the master znode so will be
* notified if another master dies. * notified if another master dies.
* @param startupStatus
* @return True if no issue becoming active master else false if another * @return True if no issue becoming active master else false if another
* master was running or if some other problem (zookeeper, stop flag has been * master was running or if some other problem (zookeeper, stop flag has been
* set on this Master) * set on this Master)
*/ */
boolean blockUntilBecomingActiveMaster() { boolean blockUntilBecomingActiveMaster(MonitoredTask startupStatus) {
startupStatus.setStatus("Trying to register in ZK as active master");
boolean cleanSetOfActiveMaster = true; boolean cleanSetOfActiveMaster = true;
// Try to become the active master, watch if there is another master // Try to become the active master, watch if there is another master
try { try {
if (ZKUtil.createEphemeralNodeAndWatch(this.watcher, if (ZKUtil.createEphemeralNodeAndWatch(this.watcher,
this.watcher.masterAddressZNode, Bytes.toBytes(this.sn.toString()))) { this.watcher.masterAddressZNode, Bytes.toBytes(this.sn.toString()))) {
// We are the master, return // We are the master, return
startupStatus.setStatus("Successfully registered as active master.");
this.clusterHasActiveMaster.set(true); this.clusterHasActiveMaster.set(true);
LOG.info("Master=" + this.sn); LOG.info("Master=" + this.sn);
return cleanSetOfActiveMaster; return cleanSetOfActiveMaster;
@ -143,13 +147,17 @@ class ActiveMasterManager extends ZooKeeperListener {
ZKUtil.getDataAndWatch(this.watcher, this.watcher.masterAddressZNode); ZKUtil.getDataAndWatch(this.watcher, this.watcher.masterAddressZNode);
ServerName currentMaster = new ServerName(Bytes.toString(bytes)); ServerName currentMaster = new ServerName(Bytes.toString(bytes));
if (ServerName.isSameHostnameAndPort(currentMaster, this.sn)) { if (ServerName.isSameHostnameAndPort(currentMaster, this.sn)) {
LOG.info("Current master has this master's address, " + currentMaster + String msg = ("Current master has this master's address, " + currentMaster +
"; master was restarted? Waiting on znode to expire..."); "; master was restarted? Waiting on znode to expire...");
LOG.info(msg);
startupStatus.setStatus(msg);
// Hurry along the expiration of the znode. // Hurry along the expiration of the znode.
ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode); ZKUtil.deleteNode(this.watcher, this.watcher.masterAddressZNode);
} else { } else {
LOG.info("Another master is the active master, " + currentMaster + String msg = "Another master is the active master, " + currentMaster +
"; waiting to become the next active master"); "; waiting to become the next active master";
LOG.info(msg);
startupStatus.setStatus(msg);
} }
} catch (KeeperException ke) { } catch (KeeperException ke) {
master.abort("Received an unexpected KeeperException, aborting", ke); master.abort("Received an unexpected KeeperException, aborting", ke);
@ -168,7 +176,7 @@ class ActiveMasterManager extends ZooKeeperListener {
return cleanSetOfActiveMaster; return cleanSetOfActiveMaster;
} }
// Try to become active master again now that there is no active master // Try to become active master again now that there is no active master
blockUntilBecomingActiveMaster(); blockUntilBecomingActiveMaster(startupStatus);
} }
return cleanSetOfActiveMaster; return cleanSetOfActiveMaster;
} }

View File

@ -71,6 +71,8 @@ import org.apache.hadoop.hbase.master.handler.TableAddFamilyHandler;
import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler; import org.apache.hadoop.hbase.master.handler.TableDeleteFamilyHandler;
import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler; import org.apache.hadoop.hbase.master.handler.TableModifyFamilyHandler;
import org.apache.hadoop.hbase.master.metrics.MasterMetrics; import org.apache.hadoop.hbase.master.metrics.MasterMetrics;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.replication.regionserver.Replication; import org.apache.hadoop.hbase.replication.regionserver.Replication;
import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.security.User;
@ -271,6 +273,9 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
*/ */
@Override @Override
public void run() { public void run() {
MonitoredTask startupStatus =
TaskMonitor.get().createStatus("Master startup");
startupStatus.setDescription("Master startup");
try { try {
/* /*
* Block on becoming the active master. * Block on becoming the active master.
@ -282,16 +287,18 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
* now wait until it dies to try and become the next active master. If we * now wait until it dies to try and become the next active master. If we
* do not succeed on our first attempt, this is no longer a cluster startup. * do not succeed on our first attempt, this is no longer a cluster startup.
*/ */
becomeActiveMaster(); becomeActiveMaster(startupStatus);
// We are either the active master or we were asked to shutdown // We are either the active master or we were asked to shutdown
if (!this.stopped) { if (!this.stopped) {
finishInitialization(); finishInitialization(startupStatus);
loop(); loop();
} }
} catch (Throwable t) { } catch (Throwable t) {
abort("Unhandled exception. Starting shutdown.", t); abort("Unhandled exception. Starting shutdown.", t);
} finally { } finally {
startupStatus.cleanup();
stopChores(); stopChores();
// Wait for all the remaining region servers to report in IFF we were // Wait for all the remaining region servers to report in IFF we were
// running a cluster shutdown AND we were NOT aborting. // running a cluster shutdown AND we were NOT aborting.
@ -313,17 +320,19 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
/** /**
* Try becoming active master. * Try becoming active master.
* @param startupStatus
* @return True if we could successfully become the active master. * @return True if we could successfully become the active master.
* @throws InterruptedException * @throws InterruptedException
*/ */
private boolean becomeActiveMaster() throws InterruptedException { private boolean becomeActiveMaster(MonitoredTask startupStatus)
throws InterruptedException {
// TODO: This is wrong!!!! Should have new servername if we restart ourselves, // TODO: This is wrong!!!! Should have new servername if we restart ourselves,
// if we come back to life. // if we come back to life.
this.activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName, this.activeMasterManager = new ActiveMasterManager(zooKeeper, this.serverName,
this); this);
this.zooKeeper.registerListener(activeMasterManager); this.zooKeeper.registerListener(activeMasterManager);
stallIfBackupMaster(this.conf, this.activeMasterManager); stallIfBackupMaster(this.conf, this.activeMasterManager);
return this.activeMasterManager.blockUntilBecomingActiveMaster(); return this.activeMasterManager.blockUntilBecomingActiveMaster(startupStatus);
} }
/** /**
@ -386,7 +395,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
* @throws InterruptedException * @throws InterruptedException
* @throws KeeperException * @throws KeeperException
*/ */
private void finishInitialization() private void finishInitialization(MonitoredTask status)
throws IOException, InterruptedException, KeeperException { throws IOException, InterruptedException, KeeperException {
isActiveMaster = true; isActiveMaster = true;
@ -397,9 +406,12 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
* below after we determine if cluster startup or failover. * below after we determine if cluster startup or failover.
*/ */
status.setStatus("Initializing Master file system");
// TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring. // TODO: Do this using Dependency Injection, using PicoContainer, Guice or Spring.
this.fileSystemManager = new MasterFileSystem(this, metrics); this.fileSystemManager = new MasterFileSystem(this, metrics);
// publish cluster ID // publish cluster ID
status.setStatus("Publishing Cluster ID in ZooKeeper");
ClusterId.setClusterId(this.zooKeeper, ClusterId.setClusterId(this.zooKeeper,
fileSystemManager.getClusterId()); fileSystemManager.getClusterId());
@ -407,16 +419,19 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
this.serverManager = new ServerManager(this, this); this.serverManager = new ServerManager(this, this);
status.setStatus("Initializing ZK system trackers");
initializeZKBasedSystemTrackers(); initializeZKBasedSystemTrackers();
// initialize master side coprocessors before we start handling requests // initialize master side coprocessors before we start handling requests
status.setStatus("Initializing master coprocessors");
this.cpHost = new MasterCoprocessorHost(this, this.conf); this.cpHost = new MasterCoprocessorHost(this, this.conf);
// start up all service threads. // start up all service threads.
status.setStatus("Initializing master service threads");
startServiceThreads(); startServiceThreads();
// Wait for region servers to report in. // Wait for region servers to report in.
this.serverManager.waitForRegionServers(); this.serverManager.waitForRegionServers(status);
// Check zk for regionservers that are up but didn't register // Check zk for regionservers that are up but didn't register
for (ServerName sn: this.regionServerTracker.getOnlineServers()) { for (ServerName sn: this.regionServerTracker.getOnlineServers()) {
if (!this.serverManager.isServerOnline(sn)) { if (!this.serverManager.isServerOnline(sn)) {
@ -427,20 +442,25 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
} }
// TODO: Should do this in background rather than block master startup // TODO: Should do this in background rather than block master startup
status.setStatus("Splitting logs after master startup");
this.fileSystemManager. this.fileSystemManager.
splitLogAfterStartup(this.serverManager.getOnlineServers().keySet()); splitLogAfterStartup(this.serverManager.getOnlineServers().keySet());
// Make sure root and meta assigned before proceeding. // Make sure root and meta assigned before proceeding.
assignRootAndMeta(); assignRootAndMeta(status);
// Fixup assignment manager status // Fixup assignment manager status
status.setStatus("Starting assignment manager");
this.assignmentManager.joinCluster(); this.assignmentManager.joinCluster();
// Start balancer and meta catalog janitor after meta and regions have // Start balancer and meta catalog janitor after meta and regions have
// been assigned. // been assigned.
status.setStatus("Starting balancer and catalog janitor");
this.balancerChore = getAndStartBalancerChore(this); this.balancerChore = getAndStartBalancerChore(this);
this.catalogJanitorChore = this.catalogJanitorChore =
Threads.setDaemonThreadRunning(new CatalogJanitor(this, this)); Threads.setDaemonThreadRunning(new CatalogJanitor(this, this));
status.markComplete("Initialization successful");
LOG.info("Master has completed initialization"); LOG.info("Master has completed initialization");
initialized = true; initialized = true;
} }
@ -453,12 +473,13 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
* @throws KeeperException * @throws KeeperException
* @return Count of regions we assigned. * @return Count of regions we assigned.
*/ */
int assignRootAndMeta() int assignRootAndMeta(MonitoredTask status)
throws InterruptedException, IOException, KeeperException { throws InterruptedException, IOException, KeeperException {
int assigned = 0; int assigned = 0;
long timeout = this.conf.getLong("hbase.catalog.verification.timeout", 1000); long timeout = this.conf.getLong("hbase.catalog.verification.timeout", 1000);
// Work on ROOT region. Is it in zk in transition? // Work on ROOT region. Is it in zk in transition?
status.setStatus("Assigning ROOT region");
boolean rit = this.assignmentManager. boolean rit = this.assignmentManager.
processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.ROOT_REGIONINFO); processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.ROOT_REGIONINFO);
if (!catalogTracker.verifyRootRegionLocation(timeout)) { if (!catalogTracker.verifyRootRegionLocation(timeout)) {
@ -474,6 +495,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
", location=" + catalogTracker.getRootLocation()); ", location=" + catalogTracker.getRootLocation());
// Work on meta region // Work on meta region
status.setStatus("Assigning META region");
rit = this.assignmentManager. rit = this.assignmentManager.
processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO); processRegionInTransitionAndBlockUntilAssigned(HRegionInfo.FIRST_META_REGIONINFO);
if (!this.catalogTracker.verifyMetaRegionLocation(timeout)) { if (!this.catalogTracker.verifyMetaRegionLocation(timeout)) {
@ -490,6 +512,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
} }
LOG.info(".META. assigned=" + assigned + ", rit=" + rit + LOG.info(".META. assigned=" + assigned + ", rit=" + rit +
", location=" + catalogTracker.getMetaLocation()); ", location=" + catalogTracker.getMetaLocation());
status.setStatus("META and ROOT assigned.");
return assigned; return assigned;
} }
@ -1101,15 +1124,21 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":" this.zooKeeper = new ZooKeeperWatcher(conf, MASTER + ":"
+ this.serverName.getPort(), this); + this.serverName.getPort(), this);
if (!becomeActiveMaster()) { MonitoredTask status =
TaskMonitor.get().createStatus("Recovering expired ZK session");
try {
if (!becomeActiveMaster(status)) {
return false; return false;
} }
initializeZKBasedSystemTrackers(); initializeZKBasedSystemTrackers();
// Update in-memory structures to reflect our earlier Root/Meta assignment. // Update in-memory structures to reflect our earlier Root/Meta assignment.
assignRootAndMeta(); assignRootAndMeta(status);
// process RIT if any // process RIT if any
this.assignmentManager.processRegionsInTransition(); this.assignmentManager.processRegionsInTransition();
return true; return true;
} finally {
status.cleanup();
}
} }
/** /**

View File

@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.client.RetriesExhaustedException;
import org.apache.hadoop.hbase.ipc.HRegionInterface; import org.apache.hadoop.hbase.ipc.HRegionInterface;
import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler; import org.apache.hadoop.hbase.master.handler.MetaServerShutdownHandler;
import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler; import org.apache.hadoop.hbase.master.handler.ServerShutdownHandler;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
/** /**
* The ServerManager class manages info about region servers. * The ServerManager class manages info about region servers.
@ -466,7 +467,7 @@ public class ServerManager {
* Waits for the regionservers to report in. * Waits for the regionservers to report in.
* @throws InterruptedException * @throws InterruptedException
*/ */
public void waitForRegionServers() public void waitForRegionServers(MonitoredTask status)
throws InterruptedException { throws InterruptedException {
long interval = this.master.getConfiguration(). long interval = this.master.getConfiguration().
getLong("hbase.master.wait.on.regionservers.interval", 3000); getLong("hbase.master.wait.on.regionservers.interval", 3000);
@ -477,11 +478,15 @@ public class ServerManager {
Thread.sleep(interval); Thread.sleep(interval);
count = countOfRegionServers(); count = countOfRegionServers();
if (count == oldcount && count > 0) break; if (count == oldcount && count > 0) break;
String msg;
if (count == 0) { if (count == 0) {
LOG.info("Waiting on regionserver(s) to checkin"); msg = "Waiting on regionserver(s) to checkin";
} else { } else {
LOG.info("Waiting on regionserver(s) count to settle; currently=" + count); msg = "Waiting on regionserver(s) count to settle; currently=" + count;
} }
LOG.info(msg);
status.setStatus(msg);
oldcount = count; oldcount = count;
} }
} }

View File

@ -35,6 +35,8 @@ import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Chore; import org.apache.hadoop.hbase.Chore;
import org.apache.hadoop.hbase.Stoppable; import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status; import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.regionserver.SplitLogWorker; import org.apache.hadoop.hbase.regionserver.SplitLogWorker;
import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter; import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter;
import org.apache.hadoop.hbase.regionserver.wal.OrphanHLogAfterSplitException; import org.apache.hadoop.hbase.regionserver.wal.OrphanHLogAfterSplitException;
@ -183,11 +185,18 @@ public class SplitLogManager extends ZooKeeperListener {
LOG.warn(logDir + " doesn't exist. Nothing to do!"); LOG.warn(logDir + " doesn't exist. Nothing to do!");
return 0; return 0;
} }
MonitoredTask status = TaskMonitor.get().createStatus(
"Doing distributed log split in " + logDir);
status.setStatus("Checking directory contents...");
FileStatus[] logfiles = fs.listStatus(logDir); // TODO filter filenames? FileStatus[] logfiles = fs.listStatus(logDir); // TODO filter filenames?
if (logfiles == null || logfiles.length == 0) { if (logfiles == null || logfiles.length == 0) {
LOG.info(logDir + " is empty dir, no logs to split"); LOG.info(logDir + " is empty dir, no logs to split");
return 0; return 0;
} }
status.setStatus("Scheduling batch of logs to split");
tot_mgr_log_split_batch_start.incrementAndGet(); tot_mgr_log_split_batch_start.incrementAndGet();
LOG.info("started splitting logs in " + logDir); LOG.info("started splitting logs in " + logDir);
long t = EnvironmentEdgeManager.currentTimeMillis(); long t = EnvironmentEdgeManager.currentTimeMillis();
@ -205,7 +214,7 @@ public class SplitLogManager extends ZooKeeperListener {
+ lf.getPath()); + lf.getPath());
} }
} }
waitTasks(batch); waitTasks(batch, status);
if (batch.done != batch.installed) { if (batch.done != batch.installed) {
stopTrackingTasks(batch); stopTrackingTasks(batch);
tot_mgr_log_split_batch_err.incrementAndGet(); tot_mgr_log_split_batch_err.incrementAndGet();
@ -214,6 +223,8 @@ public class SplitLogManager extends ZooKeeperListener {
throw new IOException("error or interrupt while splitting logs in " throw new IOException("error or interrupt while splitting logs in "
+ logDir + " Task = " + batch); + logDir + " Task = " + batch);
} }
status.setStatus("Checking for orphaned logs in log directory...");
if (anyNewLogFiles(logDir, logfiles)) { if (anyNewLogFiles(logDir, logfiles)) {
tot_mgr_new_unexpected_hlogs.incrementAndGet(); tot_mgr_new_unexpected_hlogs.incrementAndGet();
LOG.warn("new hlogs were produced while logs in " + logDir + LOG.warn("new hlogs were produced while logs in " + logDir +
@ -221,12 +232,18 @@ public class SplitLogManager extends ZooKeeperListener {
throw new OrphanHLogAfterSplitException(); throw new OrphanHLogAfterSplitException();
} }
tot_mgr_log_split_batch_success.incrementAndGet(); tot_mgr_log_split_batch_success.incrementAndGet();
status.setStatus("Cleaning up log directory...");
if (!fs.delete(logDir, true)) { if (!fs.delete(logDir, true)) {
throw new IOException("Unable to delete src dir: " + logDir); throw new IOException("Unable to delete src dir: " + logDir);
} }
LOG.info("finished splitting (more than or equal to) " + totalSize +
String msg = "finished splitting (more than or equal to) " + totalSize +
" bytes in " + batch.installed + " log files in " + logDir + " in " + " bytes in " + batch.installed + " log files in " + logDir + " in " +
(EnvironmentEdgeManager.currentTimeMillis() - t) + "ms"); (EnvironmentEdgeManager.currentTimeMillis() - t) + "ms";
status.markComplete(msg);
LOG.info(msg);
return totalSize; return totalSize;
} }
@ -244,10 +261,14 @@ public class SplitLogManager extends ZooKeeperListener {
return false; return false;
} }
private void waitTasks(TaskBatch batch) { private void waitTasks(TaskBatch batch, MonitoredTask status) {
synchronized (batch) { synchronized (batch) {
while ((batch.done + batch.error) != batch.installed) { while ((batch.done + batch.error) != batch.installed) {
try { try {
status.setStatus("Waiting for distributed tasks to finish. "
+ " scheduled=" + batch.installed
+ " done=" + batch.done
+ " error=" + batch.error);
batch.wait(100); batch.wait(100);
if (stopper.isStopped()) { if (stopper.isStopped()) {
LOG.warn("Stopped while waiting for log splits to be completed"); LOG.warn("Stopped while waiting for log splits to be completed");

View File

@ -0,0 +1,53 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.monitoring;
/**
 * Handle on a single tracked operation: exposes its description, its most
 * recent status message, its lifecycle {@link State} and its timestamps, and
 * lets the code performing the operation update them.
 */
public interface MonitoredTask {

  /** Lifecycle states of a tracked task. */
  enum State {
    RUNNING,
    COMPLETE,
    ABORTED
  }

  /** @return the time at which this task was started */
  long getStartTime();

  /** @return the description of the operation being tracked */
  String getDescription();

  /** @return the most recently set status message */
  String getStatus();

  /** @return the current lifecycle state of this task */
  State getState();

  /** @return the time at which this task stopped running */
  long getCompletionTimestamp();

  /**
   * Marks this task as finished successfully.
   * @param msg final status message
   */
  void markComplete(String msg);

  /**
   * Marks this task as aborted.
   * @param msg final status message
   */
  void abort(String msg);

  /**
   * Replaces the current status message.
   * @param status new status message
   */
  void setStatus(String status);

  /**
   * Replaces the description of this task.
   * @param description new description
   */
  void setDescription(String description);

  /**
   * Explicitly mark this status as able to be cleaned up,
   * even though it might not be complete.
   */
  void cleanup();
}

View File

@ -0,0 +1,102 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.monitoring;
import com.google.common.annotations.VisibleForTesting;
/**
 * Plain holder implementation of {@link MonitoredTask}.
 *
 * A task is updated by the thread doing the work and read by whichever thread
 * asks the monitor for the task list, so every mutable field is
 * <code>volatile</code>: readers always observe the latest value written.
 * Updates that touch several fields are not atomic as a group; terminal
 * transitions therefore publish the state last, so a reader that observes a
 * non-RUNNING state also observes the final status and completion timestamp.
 */
class MonitoredTaskImpl implements MonitoredTask {
  private final long startTime;
  // -1 until the task leaves the RUNNING state.
  private volatile long completionTimestamp = -1;

  private volatile String status;
  private volatile String description;

  private volatile State state = State.RUNNING;

  /** Creates a task in the RUNNING state, started now. */
  public MonitoredTaskImpl() {
    startTime = System.currentTimeMillis();
  }

  /** @return creation time of this task, in milliseconds since the epoch */
  @Override
  public long getStartTime() {
    return startTime;
  }

  /** @return the description last set, or null if none was set */
  @Override
  public String getDescription() {
    return description;
  }

  /** @return the status message last set, or null if none was set */
  @Override
  public String getStatus() {
    return status;
  }

  /** @return the current lifecycle state */
  @Override
  public State getState() {
    return state;
  }

  /**
   * @return the time, in milliseconds since the epoch, at which the task left
   * the RUNNING state, or -1 if it is still running
   */
  @Override
  public long getCompletionTimestamp() {
    return completionTimestamp;
  }

  /**
   * Marks the task COMPLETE with the given final status message.
   * @param status final status message
   */
  @Override
  public void markComplete(String status) {
    setStatus(status);
    completionTimestamp = System.currentTimeMillis();
    // Published last; see class comment.
    state = State.COMPLETE;
  }

  /**
   * Marks the task ABORTED with the given final status message.
   * @param msg final status message
   */
  @Override
  public void abort(String msg) {
    setStatus(msg);
    completionTimestamp = System.currentTimeMillis();
    // Published last; see class comment.
    state = State.ABORTED;
  }

  /** @param status new status message */
  @Override
  public void setStatus(String status) {
    this.status = status;
  }

  /** @param description new description */
  @Override
  public void setDescription(String description) {
    this.description = description;
  }

  /**
   * Marks a task that is still RUNNING as ABORTED so that it becomes eligible
   * for expiry. A task that already completed or aborted is left untouched.
   */
  @Override
  public void cleanup() {
    if (state == State.RUNNING) {
      completionTimestamp = System.currentTimeMillis();
      state = State.ABORTED;
    }
  }

  /**
   * Force the completion timestamp backwards so that
   * it expires now. Only meaningful once the task has a completion
   * timestamp, i.e. after it has completed, aborted or been cleaned up.
   */
  @VisibleForTesting
  void expireNow() {
    completionTimestamp -= 180 * 1000;
  }

  /** @return a readable summary, used when the monitor logs a task */
  @Override
  public String toString() {
    return "MonitoredTask[description=" + description
        + ", state=" + state
        + ", status=" + status + "]";
  }
}

View File

@ -0,0 +1,176 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.monitoring;
import java.lang.ref.WeakReference;
import java.lang.reflect.InvocationHandler;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.lang.reflect.Proxy;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.collect.Lists;
/**
* Singleton which keeps track of tasks going on in this VM.
* A Task here is anything which takes more than a few seconds
* and the user might want to inquire about the status
*/
public class TaskMonitor {
private static final Log LOG = LogFactory.getLog(TaskMonitor.class);
// Don't keep around any tasks that have completed more than
// 60 seconds ago
private static final long EXPIRATION_TIME = 60*1000;
@VisibleForTesting
static final int MAX_TASKS = 1000;
private static TaskMonitor instance;
private List<TaskAndWeakRefPair> tasks =
Lists.newArrayList();
/**
* Get singleton instance.
* TODO this would be better off scoped to a single daemon
*/
public static synchronized TaskMonitor get() {
if (instance == null) {
instance = new TaskMonitor();
}
return instance;
}
public MonitoredTask createStatus(String description) {
MonitoredTask stat = new MonitoredTaskImpl();
stat.setDescription(description);
MonitoredTask proxy = (MonitoredTask) Proxy.newProxyInstance(
stat.getClass().getClassLoader(),
new Class<?>[] { MonitoredTask.class },
new PassthroughInvocationHandler<MonitoredTask>(stat));
TaskAndWeakRefPair pair = new TaskAndWeakRefPair(stat, proxy);
tasks.add(pair);
return proxy;
}
private synchronized void purgeExpiredTasks() {
int size = 0;
for (Iterator<TaskAndWeakRefPair> it = tasks.iterator();
it.hasNext();) {
TaskAndWeakRefPair pair = it.next();
MonitoredTask stat = pair.get();
if (pair.isDead()) {
// The class who constructed this leaked it. So we can
// assume it's done.
if (stat.getState() == MonitoredTaskImpl.State.RUNNING) {
LOG.warn("Status " + stat + " appears to have been leaked");
stat.cleanup();
}
}
if (canPurge(stat)) {
it.remove();
} else {
size++;
}
}
if (size > MAX_TASKS) {
LOG.warn("Too many actions in action monitor! Purging some.");
tasks = tasks.subList(size - MAX_TASKS, size);
}
}
public synchronized List<MonitoredTask> getTasks() {
purgeExpiredTasks();
ArrayList<MonitoredTask> ret = Lists.newArrayListWithCapacity(tasks.size());
for (TaskAndWeakRefPair pair : tasks) {
ret.add(pair.get());
}
return ret;
}
private boolean canPurge(MonitoredTask stat) {
long cts = stat.getCompletionTimestamp();
return (cts > 0 && System.currentTimeMillis() - cts > EXPIRATION_TIME);
}
/**
* This class encapsulates an object as well as a weak reference to a proxy
* that passes through calls to that object. In art form:
* <code>
* Proxy <------------------
* | \
* v \
* PassthroughInvocationHandler | weak reference
* | /
* MonitoredTaskImpl /
* | /
* StatAndWeakRefProxy ------/
*
* Since we only return the Proxy to the creator of the MonitorableStatus,
* this means that they can leak that object, and we'll detect it
* since our weak reference will go null. But, we still have the actual
* object, so we can log it and display it as a leaked (incomplete) action.
*/
private static class TaskAndWeakRefPair {
private MonitoredTask impl;
private WeakReference<MonitoredTask> weakProxy;
public TaskAndWeakRefPair(MonitoredTask stat,
MonitoredTask proxy) {
this.impl = stat;
this.weakProxy = new WeakReference<MonitoredTask>(proxy);
}
public MonitoredTask get() {
return impl;
}
public boolean isDead() {
return weakProxy.get() == null;
}
}
/**
* An InvocationHandler that simply passes through calls to the original object.
*/
private static class PassthroughInvocationHandler<T> implements InvocationHandler {
private T delegatee;
public PassthroughInvocationHandler(T delegatee) {
this.delegatee = delegatee;
}
@Override
public Object invoke(Object proxy, Method method, Object[] args)
throws Throwable {
return method.invoke(delegatee, args);
}
}
}

View File

@ -83,6 +83,8 @@ import org.apache.hadoop.hbase.io.TimeRange;
import org.apache.hadoop.hbase.io.hfile.BlockCache; import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.ipc.CoprocessorProtocol; import org.apache.hadoop.hbase.ipc.CoprocessorProtocol;
import org.apache.hadoop.hbase.ipc.HBaseRPC; import org.apache.hadoop.hbase.ipc.HBaseRPC;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.regionserver.wal.HLog; import org.apache.hadoop.hbase.regionserver.wal.HLog;
import org.apache.hadoop.hbase.regionserver.wal.HLogKey; import org.apache.hadoop.hbase.regionserver.wal.HLogKey;
import org.apache.hadoop.hbase.regionserver.wal.WALEdit; import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
@ -349,7 +351,12 @@ public class HRegion implements HeapSize { // , Writable{
*/ */
public long initialize(final CancelableProgressable reporter) public long initialize(final CancelableProgressable reporter)
throws IOException { throws IOException {
MonitoredTask status = TaskMonitor.get().createStatus(
"Initializing region " + this);
if (coprocessorHost != null) { if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-open hook");
coprocessorHost.preOpen(); coprocessorHost.preOpen();
} }
// A region can be reopened if failed a split; reset flags // A region can be reopened if failed a split; reset flags
@ -357,14 +364,17 @@ public class HRegion implements HeapSize { // , Writable{
this.closed.set(false); this.closed.set(false);
// Write HRI to a file in case we need to recover .META. // Write HRI to a file in case we need to recover .META.
status.setStatus("Writing region info on filesystem");
checkRegioninfoOnFilesystem(); checkRegioninfoOnFilesystem();
// Remove temporary data left over from old regions // Remove temporary data left over from old regions
status.setStatus("Cleaning up temporary data from old regions");
cleanupTmpDir(); cleanupTmpDir();
// Load in all the HStores. Get maximum seqid. // Load in all the HStores. Get maximum seqid.
long maxSeqId = -1; long maxSeqId = -1;
for (HColumnDescriptor c : this.regionInfo.getTableDesc().getFamilies()) { for (HColumnDescriptor c : this.regionInfo.getTableDesc().getFamilies()) {
status.setStatus("Instantiating store for column family " + c);
Store store = instantiateHStore(this.tableDir, c); Store store = instantiateHStore(this.tableDir, c);
this.stores.put(c.getName(), store); this.stores.put(c.getName(), store);
long storeSeqId = store.getMaxSequenceId(); long storeSeqId = store.getMaxSequenceId();
@ -373,8 +383,10 @@ public class HRegion implements HeapSize { // , Writable{
} }
} }
// Recover any edits if available. // Recover any edits if available.
maxSeqId = replayRecoveredEditsIfAny(this.regiondir, maxSeqId, reporter); maxSeqId = replayRecoveredEditsIfAny(
this.regiondir, maxSeqId, reporter, status);
status.setStatus("Cleaning up detritus from prior splits");
// Get rid of any splits or merges that were lost in-progress. Clean out // Get rid of any splits or merges that were lost in-progress. Clean out
// these directories here on open. We may be opening a region that was // these directories here on open. We may be opening a region that was
// being split but we crashed in the middle of it all. // being split but we crashed in the middle of it all.
@ -390,9 +402,12 @@ public class HRegion implements HeapSize { // , Writable{
long nextSeqid = maxSeqId + 1; long nextSeqid = maxSeqId + 1;
LOG.info("Onlined " + this.toString() + "; next sequenceid=" + nextSeqid); LOG.info("Onlined " + this.toString() + "; next sequenceid=" + nextSeqid);
if (coprocessorHost != null) { if (coprocessorHost != null) {
status.setStatus("Running coprocessor post-open hooks");
coprocessorHost.postOpen(); coprocessorHost.postOpen();
} }
status.markComplete("Region opened successfully");
return nextSeqid; return nextSeqid;
} }
@ -556,12 +571,22 @@ public class HRegion implements HeapSize { // , Writable{
public List<StoreFile> close(final boolean abort) throws IOException { public List<StoreFile> close(final boolean abort) throws IOException {
// Only allow one thread to close at a time. Serialize them so dual // Only allow one thread to close at a time. Serialize them so dual
// threads attempting to close will run up against each other. // threads attempting to close will run up against each other.
MonitoredTask status = TaskMonitor.get().createStatus(
"Closing region " + this +
(abort ? " due to abort" : ""));
status.setStatus("Waiting for close lock");
try {
synchronized (closeLock) { synchronized (closeLock) {
return doClose(abort); return doClose(abort, status);
}
} finally {
status.cleanup();
} }
} }
private List<StoreFile> doClose(final boolean abort) private List<StoreFile> doClose(
final boolean abort, MonitoredTask status)
throws IOException { throws IOException {
if (isClosed()) { if (isClosed()) {
LOG.warn("Region " + this + " already closed"); LOG.warn("Region " + this + " already closed");
@ -569,9 +594,11 @@ public class HRegion implements HeapSize { // , Writable{
} }
if (coprocessorHost != null) { if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-close hooks");
this.coprocessorHost.preClose(abort); this.coprocessorHost.preClose(abort);
} }
status.setStatus("Disabling compacts and flushes for region");
boolean wasFlushing = false; boolean wasFlushing = false;
synchronized (writestate) { synchronized (writestate) {
// Disable compacting and flushing by background threads for this // Disable compacting and flushing by background threads for this
@ -596,20 +623,24 @@ public class HRegion implements HeapSize { // , Writable{
// that will clear out of the bulk of the memstore before we put up // that will clear out of the bulk of the memstore before we put up
// the close flag? // the close flag?
if (!abort && !wasFlushing && worthPreFlushing()) { if (!abort && !wasFlushing && worthPreFlushing()) {
status.setStatus("Pre-flushing region before close");
LOG.info("Running close preflush of " + this.getRegionNameAsString()); LOG.info("Running close preflush of " + this.getRegionNameAsString());
internalFlushcache(); internalFlushcache(status);
} }
this.closing.set(true); this.closing.set(true);
status.setStatus("Disabling writes for close");
lock.writeLock().lock(); lock.writeLock().lock();
try { try {
if (this.isClosed()) { if (this.isClosed()) {
status.abort("Already got closed by another process");
// SplitTransaction handles the null // SplitTransaction handles the null
return null; return null;
} }
LOG.debug("Updates disabled for region " + this); LOG.debug("Updates disabled for region " + this);
// Don't flush the cache if we are aborting // Don't flush the cache if we are aborting
if (!abort) { if (!abort) {
internalFlushcache(); internalFlushcache(status);
} }
List<StoreFile> result = new ArrayList<StoreFile>(); List<StoreFile> result = new ArrayList<StoreFile>();
@ -619,8 +650,10 @@ public class HRegion implements HeapSize { // , Writable{
this.closed.set(true); this.closed.set(true);
if (coprocessorHost != null) { if (coprocessorHost != null) {
status.setStatus("Running coprocessor post-close hooks");
this.coprocessorHost.postClose(abort); this.coprocessorHost.postClose(abort);
} }
status.markComplete("Closed");
LOG.info("Closed " + this); LOG.info("Closed " + this);
return result; return result;
} finally { } finally {
@ -824,6 +857,8 @@ public class HRegion implements HeapSize { // , Writable{
lock.readLock().lock(); lock.readLock().lock();
this.lastCompactInfo = null; this.lastCompactInfo = null;
byte [] splitRow = null; byte [] splitRow = null;
MonitoredTask status = TaskMonitor.get().createStatus(
"Compacting stores in " + this);
try { try {
if (this.closed.get()) { if (this.closed.get()) {
LOG.debug("Skipping compaction on " + this + " because closed"); LOG.debug("Skipping compaction on " + this + " because closed");
@ -833,6 +868,7 @@ public class HRegion implements HeapSize { // , Writable{
return splitRow; return splitRow;
} }
if (coprocessorHost != null) { if (coprocessorHost != null) {
status.setStatus("Running coprocessor preCompact hooks");
coprocessorHost.preCompact(false); coprocessorHost.preCompact(false);
} }
try { try {
@ -840,9 +876,11 @@ public class HRegion implements HeapSize { // , Writable{
if (!writestate.compacting && writestate.writesEnabled) { if (!writestate.compacting && writestate.writesEnabled) {
writestate.compacting = true; writestate.compacting = true;
} else { } else {
LOG.info("NOT compacting region " + this + String msg = "NOT compacting region " + this +
": compacting=" + writestate.compacting + ", writesEnabled=" + ": compacting=" + writestate.compacting + ", writesEnabled=" +
writestate.writesEnabled); writestate.writesEnabled;
LOG.info(msg);
status.abort(msg);
return splitRow; return splitRow;
} }
} }
@ -852,6 +890,7 @@ public class HRegion implements HeapSize { // , Writable{
long lastCompactSize = 0; long lastCompactSize = 0;
boolean completed = false; boolean completed = false;
try { try {
status.setStatus("Compacting store " + store);
final Store.StoreSize ss = store.compact(); final Store.StoreSize ss = store.compact();
lastCompactSize += store.getLastCompactSize(); lastCompactSize += store.getLastCompactSize();
if (ss != null) { if (ss != null) {
@ -868,6 +907,9 @@ public class HRegion implements HeapSize { // , Writable{
if (completed) { if (completed) {
this.lastCompactInfo = this.lastCompactInfo =
new Pair<Long,Long>((now - startTime) / 1000, lastCompactSize); new Pair<Long,Long>((now - startTime) / 1000, lastCompactSize);
status.setStatus("Compaction complete: " +
StringUtils.humanReadableInt(lastCompactSize) + " in " +
(now - startTime) + "ms");
} }
} }
} finally { } finally {
@ -877,9 +919,13 @@ public class HRegion implements HeapSize { // , Writable{
} }
} }
if (coprocessorHost != null) { if (coprocessorHost != null) {
status.setStatus("Running coprocessor post-compact hooks");
coprocessorHost.postCompact(splitRow != null); coprocessorHost.postCompact(splitRow != null);
} }
status.markComplete("Compaction complete");
} finally { } finally {
status.cleanup();
lock.readLock().unlock(); lock.readLock().unlock();
} }
if (splitRow != null) { if (splitRow != null) {
@ -915,13 +961,17 @@ public class HRegion implements HeapSize { // , Writable{
LOG.debug("Skipping flush on " + this + " because closing"); LOG.debug("Skipping flush on " + this + " because closing");
return false; return false;
} }
MonitoredTask status = TaskMonitor.get().createStatus("Flushing " + this);
status.setStatus("Acquiring readlock on region");
lock.readLock().lock(); lock.readLock().lock();
try { try {
if (this.closed.get()) { if (this.closed.get()) {
LOG.debug("Skipping flush on " + this + " because closed"); LOG.debug("Skipping flush on " + this + " because closed");
status.abort("Skipped: closed");
return false; return false;
} }
if (coprocessorHost != null) { if (coprocessorHost != null) {
status.setStatus("Running coprocessor pre-flush hooks");
coprocessorHost.preFlush(); coprocessorHost.preFlush();
} }
try { try {
@ -935,13 +985,19 @@ public class HRegion implements HeapSize { // , Writable{
writestate.flushing + ", writesEnabled=" + writestate.flushing + ", writesEnabled=" +
writestate.writesEnabled); writestate.writesEnabled);
} }
status.abort("Not flushing since " +
(writestate.flushing ? "already flushing" : "writes not enabled"));
return false; return false;
} }
} }
boolean result = internalFlushcache(); boolean result = internalFlushcache(status);
if (coprocessorHost != null) { if (coprocessorHost != null) {
status.setStatus("Running post-flush coprocessor hooks");
coprocessorHost.postFlush(); coprocessorHost.postFlush();
} }
status.markComplete("Flush successful");
return result; return result;
} finally { } finally {
synchronized (writestate) { synchronized (writestate) {
@ -952,6 +1008,7 @@ public class HRegion implements HeapSize { // , Writable{
} }
} finally { } finally {
lock.readLock().unlock(); lock.readLock().unlock();
status.cleanup();
} }
} }
@ -982,6 +1039,7 @@ public class HRegion implements HeapSize { // , Writable{
* routes. * routes.
* *
* <p> This method may block for some time. * <p> This method may block for some time.
* @param status task handle that is updated with progress messages while the flush runs
* *
* @return true if the region needs compacting * @return true if the region needs compacting
* *
@ -989,19 +1047,21 @@ public class HRegion implements HeapSize { // , Writable{
* @throws DroppedSnapshotException Thrown when replay of hlog is required * @throws DroppedSnapshotException Thrown when replay of hlog is required
* because a Snapshot was not properly persisted. * because a Snapshot was not properly persisted.
*/ */
protected boolean internalFlushcache() throws IOException { protected boolean internalFlushcache(MonitoredTask status) throws IOException {
return internalFlushcache(this.log, -1); return internalFlushcache(this.log, -1, status);
} }
/** /**
* @param wal Null if we're NOT to go via hlog/wal. * @param wal Null if we're NOT to go via hlog/wal.
* @param myseqid The seqid to use if <code>wal</code> is null writing out * @param myseqid The seqid to use if <code>wal</code> is null writing out
* flush file. * flush file.
* @param status task handle that is updated with progress messages while the flush runs
* @return true if the region needs compacting * @return true if the region needs compacting
* @throws IOException * @throws IOException
* @see #internalFlushcache() * @see #internalFlushcache()
*/ */
protected boolean internalFlushcache(final HLog wal, final long myseqid) protected boolean internalFlushcache(
final HLog wal, final long myseqid, MonitoredTask status)
throws IOException { throws IOException {
final long startTime = EnvironmentEdgeManager.currentTimeMillis(); final long startTime = EnvironmentEdgeManager.currentTimeMillis();
// Clear flush flag. // Clear flush flag.
@ -1031,7 +1091,9 @@ public class HRegion implements HeapSize { // , Writable{
// We have to take a write lock during snapshot, or else a write could // We have to take a write lock during snapshot, or else a write could
// end up in both snapshot and memstore (makes it difficult to do atomic // end up in both snapshot and memstore (makes it difficult to do atomic
// rows then) // rows then)
status.setStatus("Obtaining lock to block concurrent updates");
this.updatesLock.writeLock().lock(); this.updatesLock.writeLock().lock();
status.setStatus("Preparing to flush by snapshotting stores");
final long currentMemStoreSize = this.memstoreSize.get(); final long currentMemStoreSize = this.memstoreSize.get();
List<StoreFlusher> storeFlushers = new ArrayList<StoreFlusher>(stores.size()); List<StoreFlusher> storeFlushers = new ArrayList<StoreFlusher>(stores.size());
try { try {
@ -1049,6 +1111,7 @@ public class HRegion implements HeapSize { // , Writable{
} finally { } finally {
this.updatesLock.writeLock().unlock(); this.updatesLock.writeLock().unlock();
} }
status.setStatus("Flushing stores");
LOG.debug("Finished snapshotting, commencing flushing stores"); LOG.debug("Finished snapshotting, commencing flushing stores");
@ -1063,7 +1126,7 @@ public class HRegion implements HeapSize { // , Writable{
// just-made new flush store file. // just-made new flush store file.
for (StoreFlusher flusher : storeFlushers) { for (StoreFlusher flusher : storeFlushers) {
flusher.flushCache(); flusher.flushCache(status);
} }
// Switch snapshot (in memstore) -> new hfile (thus causing // Switch snapshot (in memstore) -> new hfile (thus causing
// all the store scanners to reset/reseek). // all the store scanners to reset/reseek).
@ -1088,6 +1151,7 @@ public class HRegion implements HeapSize { // , Writable{
DroppedSnapshotException dse = new DroppedSnapshotException("region: " + DroppedSnapshotException dse = new DroppedSnapshotException("region: " +
Bytes.toStringBinary(getRegionName())); Bytes.toStringBinary(getRegionName()));
dse.initCause(t); dse.initCause(t);
status.abort("Flush failed: " + StringUtils.stringifyException(t));
throw dse; throw dse;
} }
@ -1111,13 +1175,13 @@ public class HRegion implements HeapSize { // , Writable{
} }
long time = EnvironmentEdgeManager.currentTimeMillis() - startTime; long time = EnvironmentEdgeManager.currentTimeMillis() - startTime;
if (LOG.isDebugEnabled()) { String msg = "Finished memstore flush of ~" +
LOG.info("Finished memstore flush of ~" +
StringUtils.humanReadableInt(currentMemStoreSize) + " for region " + StringUtils.humanReadableInt(currentMemStoreSize) + " for region " +
this + " in " + time + "ms, sequenceid=" + sequenceId + this + " in " + time + "ms, sequenceid=" + sequenceId +
", compaction requested=" + compactionRequested + ", compaction requested=" + compactionRequested +
((wal == null)? "; wal=null": "")); ((wal == null)? "; wal=null": "");
} LOG.info(msg);
status.setStatus(msg);
this.recentFlushes.add(new Pair<Long,Long>(time/1000,currentMemStoreSize)); this.recentFlushes.add(new Pair<Long,Long>(time/1000,currentMemStoreSize));
return compactionRequested; return compactionRequested;
@ -2020,7 +2084,8 @@ public class HRegion implements HeapSize { // , Writable{
* @throws IOException * @throws IOException
*/ */
protected long replayRecoveredEditsIfAny(final Path regiondir, protected long replayRecoveredEditsIfAny(final Path regiondir,
final long minSeqId, final CancelableProgressable reporter) final long minSeqId, final CancelableProgressable reporter,
final MonitoredTask status)
throws UnsupportedEncodingException, IOException { throws UnsupportedEncodingException, IOException {
long seqid = minSeqId; long seqid = minSeqId;
NavigableSet<Path> files = HLog.getSplitEditFilesSorted(this.fs, regiondir); NavigableSet<Path> files = HLog.getSplitEditFilesSorted(this.fs, regiondir);
@ -2046,7 +2111,7 @@ public class HRegion implements HeapSize { // , Writable{
} }
if (seqid > minSeqId) { if (seqid > minSeqId) {
// Then we added some edits to memory. Flush and cleanup split edit files. // Then we added some edits to memory. Flush and cleanup split edit files.
internalFlushcache(null, seqid); internalFlushcache(null, seqid, status);
} }
// Now delete the content of recovered edits. We're done w/ them. // Now delete the content of recovered edits. We're done w/ them.
for (Path file: files) { for (Path file: files) {
@ -2071,7 +2136,11 @@ public class HRegion implements HeapSize { // , Writable{
private long replayRecoveredEdits(final Path edits, private long replayRecoveredEdits(final Path edits,
final long minSeqId, final CancelableProgressable reporter) final long minSeqId, final CancelableProgressable reporter)
throws IOException { throws IOException {
LOG.info("Replaying edits from " + edits + "; minSequenceid=" + minSeqId); String msg = "Replaying edits from " + edits + "; minSequenceid=" + minSeqId;
LOG.info(msg);
MonitoredTask status = TaskMonitor.get().createStatus(msg);
status.setStatus("Opening logs");
HLog.Reader reader = HLog.getReader(this.fs, edits, conf); HLog.Reader reader = HLog.getReader(this.fs, edits, conf);
try { try {
long currentEditSeqId = minSeqId; long currentEditSeqId = minSeqId;
@ -2103,10 +2172,14 @@ public class HRegion implements HeapSize { // , Writable{
intervalEdits = 0; intervalEdits = 0;
long cur = EnvironmentEdgeManager.currentTimeMillis(); long cur = EnvironmentEdgeManager.currentTimeMillis();
if (lastReport + period <= cur) { if (lastReport + period <= cur) {
status.setStatus("Replaying edits..." +
" skipped=" + skippedEdits +
" edits=" + editsCount);
// Timeout reached // Timeout reached
if(!reporter.progress()) { if(!reporter.progress()) {
String msg = "Progressable reporter failed, stopping replay"; msg = "Progressable reporter failed, stopping replay";
LOG.warn(msg); LOG.warn(msg);
status.abort(msg);
throw new IOException(msg); throw new IOException(msg);
} }
lastReport = cur; lastReport = cur;
@ -2117,6 +2190,7 @@ public class HRegion implements HeapSize { // , Writable{
// Start coprocessor replay here. The coprocessor is for each WALEdit // Start coprocessor replay here. The coprocessor is for each WALEdit
// instead of a KeyValue. // instead of a KeyValue.
if (coprocessorHost != null) { if (coprocessorHost != null) {
status.setStatus("Running pre-WAL-restore hook in coprocessors");
if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) { if (coprocessorHost.preWALRestore(this.getRegionInfo(), key, val)) {
// if bypass this log entry, ignore it ... // if bypass this log entry, ignore it ...
continue; continue;
@ -2158,7 +2232,7 @@ public class HRegion implements HeapSize { // , Writable{
flush = restoreEdit(store, kv); flush = restoreEdit(store, kv);
editsCount++; editsCount++;
} }
if (flush) internalFlushcache(null, currentEditSeqId); if (flush) internalFlushcache(null, currentEditSeqId, status);
if (coprocessorHost != null) { if (coprocessorHost != null) {
coprocessorHost.postWALRestore(this.getRegionInfo(), key, val); coprocessorHost.postWALRestore(this.getRegionInfo(), key, val);
@ -2166,30 +2240,39 @@ public class HRegion implements HeapSize { // , Writable{
} }
} catch (EOFException eof) { } catch (EOFException eof) {
Path p = HLog.moveAsideBadEditsFile(fs, edits); Path p = HLog.moveAsideBadEditsFile(fs, edits);
LOG.warn("Encountered EOF. Most likely due to Master failure during " + msg = "Encountered EOF. Most likely due to Master failure during " +
"log spliting, so we have this data in another edit. " + "log spliting, so we have this data in another edit. " +
"Continuing, but renaming " + edits + " as " + p, eof); "Continuing, but renaming " + edits + " as " + p;
LOG.warn(msg, eof);
status.abort(msg);
} catch (IOException ioe) { } catch (IOException ioe) {
// If the IOE resulted from bad file format, // If the IOE resulted from bad file format,
// then this problem is idempotent and retrying won't help // then this problem is idempotent and retrying won't help
if (ioe.getCause() instanceof ParseException) { if (ioe.getCause() instanceof ParseException) {
Path p = HLog.moveAsideBadEditsFile(fs, edits); Path p = HLog.moveAsideBadEditsFile(fs, edits);
LOG.warn("File corruption encountered! " + msg = "File corruption encountered! " +
"Continuing, but renaming " + edits + " as " + p, ioe); "Continuing, but renaming " + edits + " as " + p;
LOG.warn(msg, ioe);
status.setStatus(msg);
} else { } else {
status.abort(StringUtils.stringifyException(ioe));
// other IO errors may be transient (bad network connection, // other IO errors may be transient (bad network connection,
// checksum exception on one datanode, etc). throw & retry // checksum exception on one datanode, etc). throw & retry
throw ioe; throw ioe;
} }
} }
if (LOG.isDebugEnabled()) {
LOG.debug("Applied " + editsCount + ", skipped " + skippedEdits + msg = "Applied " + editsCount + ", skipped " + skippedEdits +
", firstSequenceidInLog=" + firstSeqIdInLog + ", firstSequenceidInLog=" + firstSeqIdInLog +
", maxSequenceidInLog=" + currentEditSeqId); ", maxSequenceidInLog=" + currentEditSeqId;
status.markComplete(msg);
if (LOG.isDebugEnabled()) {
LOG.debug(msg);
} }
return currentEditSeqId; return currentEditSeqId;
} finally { } finally {
reader.close(); reader.close();
status.cleanup();
} }
} }

View File

@ -48,6 +48,7 @@ import org.apache.hadoop.hbase.io.hfile.BlockCache;
import org.apache.hadoop.hbase.io.hfile.Compression; import org.apache.hadoop.hbase.io.hfile.Compression;
import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.io.hfile.HFileScanner; import org.apache.hadoop.hbase.io.hfile.HFileScanner;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ClassSize; import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
@ -448,11 +449,13 @@ public class Store implements HeapSize {
*/ */
private StoreFile flushCache(final long logCacheFlushId, private StoreFile flushCache(final long logCacheFlushId,
SortedSet<KeyValue> snapshot, SortedSet<KeyValue> snapshot,
TimeRangeTracker snapshotTimeRangeTracker) throws IOException { TimeRangeTracker snapshotTimeRangeTracker,
MonitoredTask status) throws IOException {
// If an exception happens flushing, we let it out without clearing // If an exception happens flushing, we let it out without clearing
// the memstore snapshot. The old snapshot will be returned when we say // the memstore snapshot. The old snapshot will be returned when we say
// 'snapshot', the next time flush comes around. // 'snapshot', the next time flush comes around.
return internalFlushCache(snapshot, logCacheFlushId, snapshotTimeRangeTracker); return internalFlushCache(
snapshot, logCacheFlushId, snapshotTimeRangeTracker, status);
} }
/* /*
@ -463,7 +466,8 @@ public class Store implements HeapSize {
*/ */
private StoreFile internalFlushCache(final SortedSet<KeyValue> set, private StoreFile internalFlushCache(final SortedSet<KeyValue> set,
final long logCacheFlushId, final long logCacheFlushId,
TimeRangeTracker snapshotTimeRangeTracker) TimeRangeTracker snapshotTimeRangeTracker,
MonitoredTask status)
throws IOException { throws IOException {
StoreFile.Writer writer = null; StoreFile.Writer writer = null;
long flushed = 0; long flushed = 0;
@ -476,6 +480,7 @@ public class Store implements HeapSize {
// flush to list of store files. Add cleanup of anything put on filesystem // flush to list of store files. Add cleanup of anything put on filesystem
// if we fail. // if we fail.
synchronized (flushLock) { synchronized (flushLock) {
status.setStatus("Flushing " + this + ": creating writer");
// A. Write the map out to the disk // A. Write the map out to the disk
writer = createWriterInTmp(set.size()); writer = createWriterInTmp(set.size());
writer.setTimeRangeTracker(snapshotTimeRangeTracker); writer.setTimeRangeTracker(snapshotTimeRangeTracker);
@ -491,18 +496,23 @@ public class Store implements HeapSize {
} finally { } finally {
// Write out the log sequence number that corresponds to this output // Write out the log sequence number that corresponds to this output
// hfile. The hfile is current up to and including logCacheFlushId. // hfile. The hfile is current up to and including logCacheFlushId.
status.setStatus("Flushing " + this + ": appending metadata");
writer.appendMetadata(logCacheFlushId, false); writer.appendMetadata(logCacheFlushId, false);
status.setStatus("Flushing " + this + ": closing flushed file");
writer.close(); writer.close();
} }
} }
// Write-out finished successfully, move into the right spot // Write-out finished successfully, move into the right spot
Path dstPath = StoreFile.getUniqueFile(fs, homedir); Path dstPath = StoreFile.getUniqueFile(fs, homedir);
LOG.info("Renaming flushed file at " + writer.getPath() + " to " + dstPath); String msg = "Renaming flushed file at " + writer.getPath() + " to " + dstPath;
LOG.info(msg);
status.setStatus("Flushing " + this + ": " + msg);
if (!fs.rename(writer.getPath(), dstPath)) { if (!fs.rename(writer.getPath(), dstPath)) {
LOG.warn("Unable to rename " + writer.getPath() + " to " + dstPath); LOG.warn("Unable to rename " + writer.getPath() + " to " + dstPath);
} }
status.setStatus("Flushing " + this + ": reopening flushed file");
StoreFile sf = new StoreFile(this.fs, dstPath, blockcache, StoreFile sf = new StoreFile(this.fs, dstPath, blockcache,
this.conf, this.family.getBloomFilterType(), this.inMemory); this.conf, this.family.getBloomFilterType(), this.inMemory);
StoreFile.Reader r = sf.createReader(); StoreFile.Reader r = sf.createReader();
@ -1593,8 +1603,9 @@ public class Store implements HeapSize {
} }
@Override @Override
public void flushCache() throws IOException { public void flushCache(MonitoredTask status) throws IOException {
storeFile = Store.this.flushCache(cacheFlushId, snapshot, snapshotTimeRangeTracker); storeFile = Store.this.flushCache(
cacheFlushId, snapshot, snapshotTimeRangeTracker, status);
} }
@Override @Override

View File

@ -21,6 +21,8 @@ package org.apache.hadoop.hbase.regionserver;
import java.io.IOException; import java.io.IOException;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
/** /**
* A package protected interface for a store flushing. * A package protected interface for a store flushing.
* A store flusher carries the state required to prepare/flush/commit the * A store flusher carries the state required to prepare/flush/commit the
@ -45,7 +47,7 @@ interface StoreFlusher {
* *
* @throws IOException in case the flush fails * @throws IOException in case the flush fails
*/ */
void flushCache() throws IOException; void flushCache(MonitoredTask status) throws IOException;
/** /**
* Commit the flush - add the store file to the store and clear the * Commit the flush - add the store file to the store and clear the

View File

@ -46,6 +46,8 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.HeapSize; import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status; import org.apache.hadoop.hbase.master.SplitLogManager.TaskFinisher.Status;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
import org.apache.hadoop.hbase.HConstants; import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.RemoteExceptionHandler; import org.apache.hadoop.hbase.RemoteExceptionHandler;
@ -70,7 +72,6 @@ import com.google.common.collect.Lists;
* region to replay on startup. Delete the old log files when finished. * region to replay on startup. Delete the old log files when finished.
*/ */
public class HLogSplitter { public class HLogSplitter {
private static final String LOG_SPLITTER_IMPL = "hbase.hlog.splitter.impl"; private static final String LOG_SPLITTER_IMPL = "hbase.hlog.splitter.impl";
/** /**
@ -107,6 +108,8 @@ public class HLogSplitter {
// consumed by the reader thread, or an exception occurred // consumed by the reader thread, or an exception occurred
Object dataAvailable = new Object(); Object dataAvailable = new Object();
private MonitoredTask status;
/** /**
* Create a new HLogSplitter using the given {@link Configuration} and the * Create a new HLogSplitter using the given {@link Configuration} and the
@ -179,10 +182,16 @@ public class HLogSplitter {
"An HLogSplitter instance may only be used once"); "An HLogSplitter instance may only be used once");
hasSplit = true; hasSplit = true;
status = TaskMonitor.get().createStatus(
"Splitting logs in " + srcDir);
long startTime = EnvironmentEdgeManager.currentTimeMillis(); long startTime = EnvironmentEdgeManager.currentTimeMillis();
status.setStatus("Determining files to split...");
List<Path> splits = null; List<Path> splits = null;
if (!fs.exists(srcDir)) { if (!fs.exists(srcDir)) {
// Nothing to do // Nothing to do
status.markComplete("No log directory existed to split.");
return splits; return splits;
} }
FileStatus[] logfiles = fs.listStatus(srcDir); FileStatus[] logfiles = fs.listStatus(srcDir);
@ -190,16 +199,21 @@ public class HLogSplitter {
// Nothing to do // Nothing to do
return splits; return splits;
} }
LOG.info("Splitting " + logfiles.length + " hlog(s) in " logAndReport("Splitting " + logfiles.length + " hlog(s) in "
+ srcDir.toString()); + srcDir.toString());
splits = splitLog(logfiles); splits = splitLog(logfiles);
splitTime = EnvironmentEdgeManager.currentTimeMillis() - startTime; splitTime = EnvironmentEdgeManager.currentTimeMillis() - startTime;
LOG.info("hlog file splitting completed in " + splitTime + logAndReport("hlog file splitting completed in " + splitTime +
" ms for " + srcDir.toString()); " ms for " + srcDir.toString());
return splits; return splits;
} }
/**
 * Publishes a progress message both to the monitored task for this split
 * and to the INFO log.
 *
 * @param msg the message to record
 */
private void logAndReport(String msg) {
  // Status is updated first, then the same text goes to the log.
  this.status.setStatus(msg);
  LOG.info(msg);
}
/** /**
* @return time that this split took * @return time that this split took
*/ */
@ -252,6 +266,7 @@ public class HLogSplitter {
boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors", true); boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors", true);
long totalBytesToSplit = countTotalBytes(logfiles);
splitSize = 0; splitSize = 0;
outputSink.startWriterThreads(entryBuffers); outputSink.startWriterThreads(entryBuffers);
@ -262,7 +277,7 @@ public class HLogSplitter {
Path logPath = log.getPath(); Path logPath = log.getPath();
long logLength = log.getLen(); long logLength = log.getLen();
splitSize += logLength; splitSize += logLength;
LOG.debug("Splitting hlog " + (i++ + 1) + " of " + logfiles.length logAndReport("Splitting hlog " + (i++ + 1) + " of " + logfiles.length
+ ": " + logPath + ", length=" + logLength); + ": " + logPath + ", length=" + logLength);
Reader in; Reader in;
try { try {
@ -284,19 +299,35 @@ public class HLogSplitter {
continue; continue;
} }
} }
status.setStatus("Log splits complete. Checking for orphaned logs.");
if (fs.listStatus(srcDir).length > processedLogs.size() if (fs.listStatus(srcDir).length > processedLogs.size()
+ corruptedLogs.size()) { + corruptedLogs.size()) {
throw new OrphanHLogAfterSplitException( throw new OrphanHLogAfterSplitException(
"Discovered orphan hlog after split. Maybe the " "Discovered orphan hlog after split. Maybe the "
+ "HRegionServer was not dead when we started"); + "HRegionServer was not dead when we started");
} }
status.setStatus("Archiving logs after completed split");
archiveLogs(srcDir, corruptedLogs, processedLogs, oldLogDir, fs, conf); archiveLogs(srcDir, corruptedLogs, processedLogs, oldLogDir, fs, conf);
} finally { } finally {
status.setStatus("Finishing writing output logs and closing down.");
splits = outputSink.finishWritingAndClose(); splits = outputSink.finishWritingAndClose();
} }
return splits; return splits;
} }
/**
 * Adds up the lengths of all the given files.
 *
 * @param logfiles the files whose lengths are summed
 * @return the sum of {@code getLen()} over every entry in {@code logfiles}
 */
private static long countTotalBytes(FileStatus[] logfiles) {
  long totalBytes = 0;
  for (int idx = 0; idx < logfiles.length; idx++) {
    totalBytes += logfiles[idx].getLen();
  }
  return totalBytes;
}
/** /**
* Splits a HLog file into a temporary staging area. tmpname is used to build * Splits a HLog file into a temporary staging area. tmpname is used to build
* the name of the staging area where the recovered-edits will be separated * the name of the staging area where the recovered-edits will be separated
@ -329,6 +360,11 @@ public class HLogSplitter {
synchronizedMap(new TreeMap<byte[], Object>(Bytes.BYTES_COMPARATOR)); synchronizedMap(new TreeMap<byte[], Object>(Bytes.BYTES_COMPARATOR));
boolean isCorrupted = false; boolean isCorrupted = false;
// Fail fast if a status has already been assigned to this splitter.
Preconditions.checkState(status == null);
// Register this split as a monitored task. The leading space before "into"
// keeps the log path from running into the rest of the description.
status = TaskMonitor.get().createStatus(
"Splitting log file " + logfile.getPath() +
" into a temporary staging area.");
Object BAD_WRITER = new Object(); Object BAD_WRITER = new Object();
boolean progress_failed = false; boolean progress_failed = false;
@ -342,6 +378,7 @@ public class HLogSplitter {
Path logPath = logfile.getPath(); Path logPath = logfile.getPath();
long logLength = logfile.getLen(); long logLength = logfile.getLen();
LOG.info("Splitting hlog: " + logPath + ", length=" + logLength); LOG.info("Splitting hlog: " + logPath + ", length=" + logLength);
status.setStatus("Opening log file");
Reader in = null; Reader in = null;
try { try {
in = getReader(fs, logfile, conf, skipErrors); in = getReader(fs, logfile, conf, skipErrors);
@ -351,12 +388,14 @@ public class HLogSplitter {
isCorrupted = true; isCorrupted = true;
} }
if (in == null) { if (in == null) {
status.markComplete("Was nothing to split in log file");
LOG.warn("Nothing to split in log file " + logPath); LOG.warn("Nothing to split in log file " + logPath);
return true; return true;
} }
long t = EnvironmentEdgeManager.currentTimeMillis(); long t = EnvironmentEdgeManager.currentTimeMillis();
long last_report_at = t; long last_report_at = t;
if (reporter != null && reporter.progress() == false) { if (reporter != null && reporter.progress() == false) {
status.markComplete("Failed: reporter.progress asked us to terminate");
return false; return false;
} }
int editsCount = 0; int editsCount = 0;
@ -380,10 +419,12 @@ public class HLogSplitter {
wap.w.append(entry); wap.w.append(entry);
editsCount++; editsCount++;
if (editsCount % interval == 0) { if (editsCount % interval == 0) {
status.setStatus("Split " + editsCount + " edits");
long t1 = EnvironmentEdgeManager.currentTimeMillis(); long t1 = EnvironmentEdgeManager.currentTimeMillis();
if ((t1 - last_report_at) > period) { if ((t1 - last_report_at) > period) {
last_report_at = t; last_report_at = t;
if (reporter != null && reporter.progress() == false) { if (reporter != null && reporter.progress() == false) {
status.markComplete("Failed: reporter.progress asked us to terminate");
progress_failed = true; progress_failed = true;
return false; return false;
} }
@ -416,10 +457,12 @@ public class HLogSplitter {
wap.w.close(); wap.w.close();
LOG.debug("Closed " + wap.p); LOG.debug("Closed " + wap.p);
} }
LOG.info("processed " + editsCount + " edits across " + n + " regions" + String msg = ("processed " + editsCount + " edits across " + n + " regions" +
" threw away edits for " + (logWriters.size() - n) + " regions" + " threw away edits for " + (logWriters.size() - n) + " regions" +
" log file = " + logPath + " log file = " + logPath +
" is corrupted = " + isCorrupted); " is corrupted = " + isCorrupted);
LOG.info(msg);
status.markComplete(msg);
} }
return true; return true;
} }

View File

@ -33,6 +33,7 @@ import org.apache.hadoop.hbase.HBaseTestingUtility;
import org.apache.hadoop.hbase.Server; import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.catalog.CatalogTracker; import org.apache.hadoop.hbase.catalog.CatalogTracker;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.zookeeper.ZKUtil; import org.apache.hadoop.hbase.zookeeper.ZKUtil;
import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener; import org.apache.hadoop.hbase.zookeeper.ZooKeeperListener;
@ -41,6 +42,7 @@ import org.apache.zookeeper.KeeperException;
import org.junit.AfterClass; import org.junit.AfterClass;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
import org.mockito.Mockito;
/** /**
* Test the {@link ActiveMasterManager}. * Test the {@link ActiveMasterManager}.
@ -77,7 +79,8 @@ public class TestActiveMasterManager {
assertFalse(activeMasterManager.clusterHasActiveMaster.get()); assertFalse(activeMasterManager.clusterHasActiveMaster.get());
// First test becoming the active master uninterrupted // First test becoming the active master uninterrupted
activeMasterManager.blockUntilBecomingActiveMaster(); MonitoredTask status = Mockito.mock(MonitoredTask.class);
activeMasterManager.blockUntilBecomingActiveMaster(status);
assertTrue(activeMasterManager.clusterHasActiveMaster.get()); assertTrue(activeMasterManager.clusterHasActiveMaster.get());
assertMaster(zk, master); assertMaster(zk, master);
@ -87,7 +90,7 @@ public class TestActiveMasterManager {
master, secondDummyMaster); master, secondDummyMaster);
zk.registerListener(secondActiveMasterManager); zk.registerListener(secondActiveMasterManager);
assertFalse(secondActiveMasterManager.clusterHasActiveMaster.get()); assertFalse(secondActiveMasterManager.clusterHasActiveMaster.get());
activeMasterManager.blockUntilBecomingActiveMaster(); activeMasterManager.blockUntilBecomingActiveMaster(status);
assertTrue(activeMasterManager.clusterHasActiveMaster.get()); assertTrue(activeMasterManager.clusterHasActiveMaster.get());
assertMaster(zk, master); assertMaster(zk, master);
} }
@ -120,7 +123,8 @@ public class TestActiveMasterManager {
assertFalse(activeMasterManager.clusterHasActiveMaster.get()); assertFalse(activeMasterManager.clusterHasActiveMaster.get());
// First test becoming the active master uninterrupted // First test becoming the active master uninterrupted
activeMasterManager.blockUntilBecomingActiveMaster(); activeMasterManager.blockUntilBecomingActiveMaster(
Mockito.mock(MonitoredTask.class));
assertTrue(activeMasterManager.clusterHasActiveMaster.get()); assertTrue(activeMasterManager.clusterHasActiveMaster.get());
assertMaster(zk, firstMasterAddress); assertMaster(zk, firstMasterAddress);
@ -201,7 +205,8 @@ public class TestActiveMasterManager {
@Override @Override
public void run() { public void run() {
manager.blockUntilBecomingActiveMaster(); manager.blockUntilBecomingActiveMaster(
Mockito.mock(MonitoredTask.class));
LOG.info("Second master has become the active master!"); LOG.info("Second master has become the active master!");
isActiveMaster = true; isActiveMaster = true;
} }

View File

@ -0,0 +1,101 @@
/**
* Copyright 2011 The Apache Software Foundation
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.monitoring;
import static org.junit.Assert.*;
import java.util.concurrent.atomic.AtomicBoolean;
import org.junit.Test;
/**
 * Tests for {@link TaskMonitor}: the basic lifecycle of a task, what happens
 * to a task whose creator drops its reference, and the cap on tracked tasks.
 */
public class TestTaskMonitor {

  /**
   * Creates a single task and follows it from running, to complete, to
   * expired, checking what the monitor lists at each step.
   */
  @Test
  public void testTaskMonitorBasics() {
    TaskMonitor monitor = new TaskMonitor();
    assertTrue("Task monitor should start empty",
        monitor.getTasks().isEmpty());

    // Register one task and read it back through the monitor's listing.
    MonitoredTask created = monitor.createStatus("Test task");
    MonitoredTask listed = monitor.getTasks().get(0);

    // A new task carries its description, has no completion time, and is running.
    assertEquals(created.getDescription(), listed.getDescription());
    assertEquals(-1, listed.getCompletionTimestamp());
    assertEquals(MonitoredTask.State.RUNNING, listed.getState());

    // Completing the task is visible through the listed view as well.
    created.markComplete("Finished!");
    assertEquals(MonitoredTask.State.COMPLETE, listed.getState());

    // A completed task is still listed...
    assertEquals(1, monitor.getTasks().size());

    // ...until it is expired, after which the monitor no longer returns it.
    ((MonitoredTaskImpl) listed).expireNow();
    assertEquals(0, monitor.getTasks().size());
  }

  /**
   * Creates a task on another thread without ever completing it, then
   * expects the monitor to report that task as aborted after garbage
   * collection has been requested.
   */
  @Test
  public void testTasksGetAbortedOnLeak() throws InterruptedException {
    final TaskMonitor monitor = new TaskMonitor();
    assertTrue("Task monitor should start empty",
        monitor.getTasks().isEmpty());

    final AtomicBoolean sawRunningState = new AtomicBoolean(false);

    // The worker creates a task and returns without marking it complete,
    // so the only reference held by test code goes away with the thread.
    Thread leaker = new Thread() {
      @Override
      public void run() {
        MonitoredTask leaked = monitor.createStatus("Test task");
        assertEquals(MonitoredTask.State.RUNNING, leaked.getState());
        sawRunningState.set(true);
      }
    };
    leaker.start();
    leaker.join();

    // The worker must have observed the task as running before it exited.
    assertTrue(sawRunningState.get());

    // Request collection several times so the leaked reference gets cleared.
    for (int pass = 0; pass < 3; pass++) {
      System.gc();
    }

    // The abandoned task should now be reported as aborted.
    MonitoredTask listed = monitor.getTasks().get(0);
    assertEquals(MonitoredTask.State.ABORTED, listed.getState());
  }

  /**
   * Creates more tasks than {@link TaskMonitor#MAX_TASKS} and checks that the
   * monitor keeps only the most recent ones.
   */
  @Test
  public void testTaskLimit() throws Exception {
    TaskMonitor monitor = new TaskMonitor();
    int created = TaskMonitor.MAX_TASKS + 10;
    for (int i = 0; i < created; i++) {
      monitor.createStatus("task " + i);
    }

    // The listing is capped at the maximum.
    assertEquals(TaskMonitor.MAX_TASKS, monitor.getTasks().size());

    // The oldest entries (tasks 0 through 9) are the ones dropped, so the
    // first remaining task is number 10.
    assertEquals("task 10", monitor.getTasks().get(0).getDescription());
  }
}

View File

@ -50,6 +50,7 @@ import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue; import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.client.Get; import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.regionserver.wal.HLog; import org.apache.hadoop.hbase.regionserver.wal.HLog;
import org.apache.hadoop.hbase.security.User; import org.apache.hadoop.hbase.security.User;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
@ -57,6 +58,7 @@ import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper; import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
import org.apache.hadoop.hbase.util.IncrementingEnvironmentEdge; import org.apache.hadoop.hbase.util.IncrementingEnvironmentEdge;
import org.apache.hadoop.hbase.util.ManualEnvironmentEdge; import org.apache.hadoop.hbase.util.ManualEnvironmentEdge;
import org.mockito.Mockito;
import com.google.common.base.Joiner; import com.google.common.base.Joiner;
@ -600,7 +602,7 @@ public class TestStore extends TestCase {
private static void flushStore(Store store, long id) throws IOException { private static void flushStore(Store store, long id) throws IOException {
StoreFlusher storeFlusher = store.getStoreFlusher(id); StoreFlusher storeFlusher = store.getStoreFlusher(id);
storeFlusher.prepare(); storeFlusher.prepare();
storeFlusher.flushCache(); storeFlusher.flushCache(Mockito.mock(MonitoredTask.class));
storeFlusher.commit(); storeFlusher.commit();
} }

View File

@ -43,6 +43,7 @@ import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put; import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.io.hfile.HFile; import org.apache.hadoop.hbase.io.hfile.HFile;
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
import org.apache.hadoop.hbase.regionserver.FlushRequester; import org.apache.hadoop.hbase.regionserver.FlushRequester;
import org.apache.hadoop.hbase.regionserver.HRegion; import org.apache.hadoop.hbase.regionserver.HRegion;
import org.apache.hadoop.hbase.regionserver.Store; import org.apache.hadoop.hbase.regionserver.Store;
@ -55,6 +56,7 @@ import org.junit.AfterClass;
import org.junit.Before; import org.junit.Before;
import org.junit.BeforeClass; import org.junit.BeforeClass;
import org.junit.Test; import org.junit.Test;
import org.mockito.Mockito;
/** /**
* Test replay of edits out of a WAL split. * Test replay of edits out of a WAL split.
@ -394,7 +396,8 @@ public class TestWALReplay {
null) { null) {
protected boolean internalFlushcache(HLog wal, long myseqid) protected boolean internalFlushcache(HLog wal, long myseqid)
throws IOException { throws IOException {
boolean b = super.internalFlushcache(wal, myseqid); boolean b = super.internalFlushcache(wal, myseqid,
Mockito.mock(MonitoredTask.class));
flushcount.incrementAndGet(); flushcount.incrementAndGet();
return b; return b;
}; };