HBASE-1099 Regions assigned while master is splitting logs of recently crashed server; regionserver tries to execute incomplete log

git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@732491 13f79535-47bb-0310-9956-ffa450edef68
Michael Stack 2009-01-07 21:03:20 +00:00
parent 18815c8879
commit 079c580042
8 changed files with 160 additions and 86 deletions

View File

@@ -131,6 +131,8 @@ Release 0.19.0 - Unreleased
HBASE-1083 Will keep scheduling major compactions if last time one ran, we
didn't.
HBASE-1101 NPE in HConnectionManager$TableServers.processBatchOfRows
HBASE-1099 Regions assigned while master is splitting logs of recently
crashed server; regionserver tries to execute incomplete log
IMPROVEMENTS
HBASE-901 Add a limit to key length, check key and value length on client side

View File

@@ -55,7 +55,6 @@ public class Leases extends Thread {
private final int leasePeriod;
private final int leaseCheckFrequency;
private volatile DelayQueue<Lease> leaseQueue = new DelayQueue<Lease>();
protected final Map<String, Lease> leases = new HashMap<String, Lease>();
private volatile boolean stopRequested = false;
@@ -88,16 +87,17 @@ public class Leases extends Thread {
if (lease == null) {
continue;
}
// A lease expired
synchronized (leaseQueue) {
leases.remove(lease.getLeaseName());
// A lease expired. Run the expiry code before removing the lease from
// the queue, since its presence there is how we test whether it still exists.
if (lease.getListener() == null) {
LOG.error("lease listener is null for lease " + lease.getLeaseName());
continue;
}
}
} else {
lease.getListener().leaseExpired();
}
synchronized (leaseQueue) {
leases.remove(lease.getLeaseName());
}
}
close();
}
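
The reordering here is the crux: other threads probe the leases map (createLease throws LeaseStillHeldException while an entry is present) to tell whether expiry processing has finished, so the listener must fire while the lease is still registered. A minimal sketch of that ordering, using illustrative names (SketchLeases, LeaseListener) rather than the real Leases API:

import java.util.HashMap;
import java.util.Map;

// Sketch only: run the expiry callback while the lease is still registered,
// then remove it, so a concurrent createLease for the same name keeps
// failing until expiry processing has completed.
class SketchLeases {
  interface LeaseListener { void leaseExpired(); }

  private final Map<String, LeaseListener> leases =
      new HashMap<String, LeaseListener>();

  synchronized void createLease(String name, LeaseListener listener) {
    if (leases.containsKey(name)) {
      // Stands in for Leases.LeaseStillHeldException.
      throw new IllegalStateException("Lease still held: " + name);
    }
    leases.put(name, listener);
  }

  void expire(String name) {
    LeaseListener listener;
    synchronized (this) { listener = leases.get(name); }
    if (listener != null) {
      listener.leaseExpired();                   // expiry code first...
    }
    synchronized (this) { leases.remove(name); } // ...removal last
  }
}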

View File

@@ -338,17 +338,16 @@ abstract class BaseScanner extends Chore implements HConstants {
throws IOException {
synchronized (regionManager) {
// Skip region - if ...
if(info.isOffline() // offline
|| regionManager.isOfflined(info.getRegionName())) { // queued for offline
// Skip region - if
if(info.isOffline() ||
regionManager.isOfflined(info.getRegionName())) { // queued for offline
regionManager.removeRegion(info);
return;
}
HServerInfo storedInfo = null;
boolean deadServerAndLogsSplit = false;
boolean deadServer = false;
if (serverName.length() != 0) {
if (regionManager.isOfflined(info.getRegionName())) {
// Skip if region is on kill list
if(LOG.isDebugEnabled()) {
@@ -357,31 +356,31 @@ abstract class BaseScanner extends Chore implements HConstants {
}
return;
}
storedInfo = master.serverManager.getServerInfo(serverName);
deadServer = master.serverManager.isDead(serverName);
storedInfo = this.master.serverManager.getServerInfo(serverName);
deadServer = this.master.serverManager.isDead(serverName);
deadServerAndLogsSplit =
this.master.serverManager.isDeadServerLogsSplit(serverName);
}
/*
* If the server is a dead server or its startcode is off -- either null
* If the server is a dead server and its logs have been split, or it's
* not on the dead servers list and its startcode is off -- either null
* or doesn't match the start code for the address -- then add it to the
* list of unassigned regions IF not already there (or pending open).
*/
if ((deadServer ||
(storedInfo == null || storedInfo.getStartCode() != startCode)) &&
(!regionManager.isUnassigned(info) &&
!regionManager.isPending(info.getRegionName()) &&
!regionManager.isAssigned(info.getRegionName()))) {
if ((deadServerAndLogsSplit ||
(!deadServer && (storedInfo == null ||
(storedInfo.getStartCode() != startCode)))) &&
this.regionManager.assignable(info)) {
// The current assignment is invalid
if (LOG.isDebugEnabled()) {
LOG.debug("Current assignment of " +
info.getRegionNameAsString() +
" is not valid." +
LOG.debug("Current assignment of " + info.getRegionNameAsString() +
" is not valid; deadServerAndLogsSplit=" + deadServerAndLogsSplit +
", deadServer=" + deadServer + ". " +
(storedInfo == null ? " Server '" + serverName + "' unknown." :
" serverInfo: " + storedInfo + ", passed startCode: " +
startCode + ", storedInfo.startCode: " + storedInfo.getStartCode()) +
startCode + ", storedInfo.startCode: " +
storedInfo.getStartCode()) +
" Region is not unassigned, assigned or pending");
}
@@ -389,7 +388,6 @@ abstract class BaseScanner extends Chore implements HConstants {
// This is only done from here if we are restarting and there is stale
// data in the meta region. Once we are on-line, dead server log
// recovery is handled by lease expiration and ProcessServerShutdown
if (!regionManager.isInitialMetaScanComplete() &&
serverName.length() != 0) {
StringBuilder dirName = new StringBuilder("log_");
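
In effect the new test reassigns a region only when its log edits are known to be safe: either the dead server's logs have finished splitting, or the server was never on the dead list and merely has a stale startcode. A sketch of the predicate with the relevant master state passed in as parameters (names are illustrative, not the BaseScanner API):

final class AssignmentCheckSketch {
  static boolean shouldReassign(boolean deadServer,
      boolean deadServerAndLogsSplit, Long storedStartCode,
      long metaStartCode, boolean assignable) {
    boolean staleAssignment =
        deadServerAndLogsSplit                  // dead and logs fully split
        || (!deadServer                         // or not known dead at all...
            && (storedStartCode == null         // ...and the server is unknown
                || storedStartCode.longValue() != metaStartCode)); // or stale
    // While logs are still splitting (deadServer && !deadServerAndLogsSplit)
    // this returns false -- which is precisely the HBASE-1099 fix.
    return staleAssignment && assignable;
  }
}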

View File

@@ -91,9 +91,7 @@ class ChangeTableState extends TableOperation {
synchronized (master.regionManager) {
if (online) {
// Bring offline regions on-line
if (!master.regionManager.isUnassigned(i) &&
!master.regionManager.isAssigned(i.getRegionName()) &&
!master.regionManager.isPending(i.getRegionName())) {
if (master.regionManager.assignable(i)) {
master.regionManager.setUnassigned(i, false);
}
} else {

View File

@@ -532,15 +532,14 @@ public class HMaster extends Thread implements HConstants, HMasterInterface,
/*
* HMasterRegionInterface
*/
public MapWritable regionServerStartup(HServerInfo serverInfo) {
// Set the address for now even tho it will not be persisted on
// the HRS side.
public MapWritable regionServerStartup(final HServerInfo serverInfo) {
// Set the address for now even tho it will not be persisted on HRS side.
String rsAddress = HBaseServer.getRemoteAddress();
serverInfo.setServerAddress(new HServerAddress
(rsAddress, serverInfo.getServerAddress().getPort()));
// register with server manager
serverManager.regionServerStartup(serverInfo);
// send back some config info
serverInfo.setServerAddress(new HServerAddress(rsAddress,
serverInfo.getServerAddress().getPort()));
// Register with server manager
this.serverManager.regionServerStartup(serverInfo);
// Send back some config info
return createConfigurationSubset();
}

View File

@@ -45,11 +45,13 @@ import org.apache.hadoop.hbase.io.RowResult;
*/
class ProcessServerShutdown extends RegionServerOperation {
private final HServerAddress deadServer;
private final String deadServerName;
/*
* Cache of the server name.
*/
private final String deadServerStr;
private final boolean rootRegionServer;
private boolean rootRegionReassigned = false;
private Path oldLogDir;
private boolean logSplit;
private boolean rootRescanned;
@@ -74,9 +76,8 @@ class ProcessServerShutdown extends RegionServerOperation {
boolean rootRegionServer) {
super(master);
this.deadServer = serverInfo.getServerAddress();
this.deadServerName = this.deadServer.toString();
this.deadServerStr = this.deadServer.toString();
this.rootRegionServer = rootRegionServer;
this.logSplit = false;
this.rootRescanned = false;
this.oldLogDir =
new Path(master.rootdir, HLog.getHLogDirectoryName(serverInfo));
@@ -84,13 +85,14 @@ class ProcessServerShutdown extends RegionServerOperation {
@Override
public String toString() {
return "ProcessServerShutdown of " + this.deadServer.toString();
return "ProcessServerShutdown of " + this.deadServerStr;
}
/** Finds regions that the dead region server was serving */
/** Finds regions that the dead region server was serving
*/
protected void scanMetaRegion(HRegionInterface server, long scannerId,
byte [] regionName) throws IOException {
byte [] regionName)
throws IOException {
List<ToDoEntry> toDoList = new ArrayList<ToDoEntry>();
Set<HRegionInfo> regions = new HashSet<HRegionInfo>();
List<byte []> emptyRows = new ArrayList<byte []>();
@@ -107,14 +109,13 @@ class ProcessServerShutdown extends RegionServerOperation {
if (values == null || values.size() == 0) {
break;
}
byte [] row = values.getRow();
// Check server name. If null, be conservative and treat as though
// region had been on shutdown server (could be null because we
// missed edits in hlog because hdfs does not do write-append).
// Check server name. If null, skip (we used to consider it was on the
// shutdown server, but that would mean we'd reassign regions that were
// already out being assigned -- ones that were the product of a split
// that happened while the shutdown was being processed).
String serverName = Writables.cellToString(values.get(COL_SERVER));
if (serverName != null && serverName.length() > 0 &&
deadServerName.compareTo(serverName) != 0) {
if (serverName == null || !deadServerStr.equals(serverName)) {
// This isn't the server you're looking for - move along
continue;
}
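
The rewritten test is stricter than before: any row whose server column is absent or names a different server is skipped outright. A hypothetical helper expressing the same filter:

final class MetaRowFilterSketch {
  // Equivalent of the new continue-test in scanMetaRegion.
  static boolean belongsToDeadServer(String rowServerName,
      String deadServerStr) {
    // A null server column means the region is in flight (mid-assignment or
    // a fresh split daughter); reassigning it caused double assignments.
    return rowServerName != null && deadServerStr.equals(rowServerName);
  }
}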
@@ -222,7 +223,6 @@ class ProcessServerShutdown extends RegionServerOperation {
long scannerId =
server.openScanner(m.getRegionName(), COLUMN_FAMILY_ARRAY,
EMPTY_START_ROW, HConstants.LATEST_TIMESTAMP, null);
scanMetaRegion(server, scannerId, m.getRegionName());
return true;
}
@@ -230,13 +230,15 @@ class ProcessServerShutdown extends RegionServerOperation {
@Override
protected boolean process() throws IOException {
LOG.info("process shutdown of server " + deadServer + ": logSplit: " +
this.logSplit + ", rootRescanned: " + rootRescanned +
boolean logSplit =
this.master.serverManager.isDeadServerLogsSplit(this.deadServerStr);
LOG.info("process shutdown of server " + this.deadServerStr +
": logSplit: " +
logSplit + ", rootRescanned: " + rootRescanned +
", numberOfMetaRegions: " +
master.regionManager.numMetaRegions() +
", onlineMetaRegions.size(): " +
master.regionManager.numOnlineMetaRegions());
if (!logSplit) {
// Process the old log file
if (master.fs.exists(oldLogDir)) {
@@ -250,7 +252,7 @@ class ProcessServerShutdown extends RegionServerOperation {
master.regionManager.splitLogLock.unlock();
}
}
logSplit = true;
this.master.serverManager.setDeadServerLogsSplit(this.deadServerStr);
}
if (this.rootRegionServer && !this.rootRegionReassigned) {
@@ -277,7 +279,6 @@ class ProcessServerShutdown extends RegionServerOperation {
new MetaRegion(master.getRootRegionLocation(),
HRegionInfo.ROOT_REGIONINFO.getRegionName(),
HConstants.EMPTY_START_ROW), this.master).doWithRetries();
if (result == null) {
// Master is closing - give up
return true;
@@ -290,7 +291,6 @@ class ProcessServerShutdown extends RegionServerOperation {
}
rootRescanned = true;
}
if (!metaTableAvailable()) {
// We can't proceed because not all meta regions are online.
// metaAvailable() has put this request on the delayedToDoQueue
@@ -309,7 +309,11 @@ class ProcessServerShutdown extends RegionServerOperation {
Bytes.toString(r.getRegionName()) + " on " + r.getServer());
}
}
master.serverManager.removeDeadServer(deadServerName);
// Remove this server from dead servers list. Finished splitting logs.
this.master.serverManager.removeDeadServer(deadServerStr);
if (LOG.isDebugEnabled()) {
LOG.debug("Removed " + deadServerStr + " from deadservers Map");
}
return true;
}
}
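
Pulling the pieces together: process() now records log-split completion in ServerManager (shared state, visible to BaseScanner and to report-for-duty handling) rather than in a private field, and drops the dead-server entry only at the very end. A condensed sketch of that ordering; DeadServers, splitLog and rescanRootAndMeta are stand-ins, not HBase APIs:

final class ShutdownOrderSketch {
  interface DeadServers {
    boolean isLogsSplit(String server);
    void setLogsSplit(String server);
    void remove(String server);
  }
  interface Step { boolean run(); }

  static boolean process(DeadServers dead, String server,
      Runnable splitLog, Step rescanRootAndMeta) {
    if (!dead.isLogsSplit(server)) {
      splitLog.run();             // replay the crashed server's HLog first
      dead.setLogsSplit(server);  // only now may scanners reassign its regions
    }
    if (!rescanRootAndMeta.run()) {
      return false;               // meta unavailable; operation is requeued
    }
    dead.remove(server);          // finished: clear the dead-servers entry
    return true;
  }
}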

View File

@@ -695,6 +695,17 @@ class RegionManager implements HConstants {
return false;
}
/**
* @param hri
* @return True if the passed region is assignable: i.e. not assigned, not
* pending open, and not already queued as unassigned.
*/
public boolean assignable(final HRegionInfo hri) {
return !isUnassigned(hri) &&
!isPending(hri.getRegionName()) &&
!isAssigned(hri.getRegionName());
}
/**
* @param regionName
* @return true if region has been assigned
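
For reference, a hypothetical caller of the new helper, consolidating the triple check that BaseScanner and ChangeTableState previously spelled out inline (RegionManagerLike stands in for RegionManager; Object stands in for HRegionInfo):

final class AssignableUsageSketch {
  interface RegionManagerLike {
    boolean assignable(Object regionInfo);
    void setUnassigned(Object regionInfo, boolean force);
  }

  static void bringOnline(RegionManagerLike rm, Object info) {
    if (rm.assignable(info)) {       // not unassigned, pending, or assigned
      rm.setUnassigned(info, false); // safe to queue for (re)assignment
    }
  }
}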

View File

@@ -59,9 +59,13 @@ class ServerManager implements HConstants {
final Map<String, HServerInfo> serversToServerInfo =
new ConcurrentHashMap<String, HServerInfo>();
/** Set of known dead servers */
final Set<String> deadServers =
Collections.synchronizedSet(new HashSet<String>());
/**
* Map of known dead servers. On lease expiration, servers are added here
* mapped to Boolean.FALSE; the Boolean records whether the server's logs
* have been split yet.
*/
private final Map<String, Boolean> deadServers =
new ConcurrentHashMap<String, Boolean>();
/** SortedMap server load -> Set of server names */
final SortedMap<HServerLoad, Set<String>> loadToServers =
@@ -90,23 +94,66 @@ class ServerManager implements HConstants {
getLong("hbase.master.avgload.logging.period", 60000);
}
/*
* Look to see if we have ghost references to this regionserver such as
* still-existing leases or if regionserver is on the dead servers list
* getting its logs processed.
* @param serverInfo
* @return True once any ghost references have been cleared and registration
* can proceed; false if the master is shutting down and we gave up waiting.
*/
private boolean checkForGhostReferences(final HServerInfo serverInfo) {
String s = serverInfo.getServerAddress().toString().trim();
boolean result = false;
boolean lease = false;
for (long sleepTime = -1; !master.closed.get() && !result;) {
if (sleepTime != -1) {
try {
Thread.sleep(sleepTime);
} catch (InterruptedException e) {
// Continue
}
}
if (!lease) {
try {
this.serverLeases.createLease(s, new ServerExpirer(s));
} catch (Leases.LeaseStillHeldException e) {
LOG.debug("Waiting on current lease to expire for " + e.getName());
sleepTime = this.master.leaseTimeout / 4;
continue;
}
lease = true;
}
// May be on list of dead servers. If so, wait till we've cleared it.
String addr = serverInfo.getServerAddress().toString();
if (isDead(addr) && !isDeadServerLogsSplit(addr)) {
LOG.debug("Waiting on " + addr + " removal from dead list before " +
"processing report-for-duty request");
sleepTime = this.master.threadWakeFrequency;
try {
// Keep the lease alive; we may wait here longer than the lease timeout.
this.serverLeases.renewLease(s);
} catch (LeaseException e) {
LOG.warn("Failed renewal. Retrying.", e);
}
continue;
}
result = true;
}
return result;
}
/**
* Let the server manager know a new regionserver has come online
* @param serverInfo
*/
public void regionServerStartup(HServerInfo serverInfo) {
public void regionServerStartup(final HServerInfo serverInfo) {
String s = serverInfo.getServerAddress().toString().trim();
LOG.info("Received start message from: " + s);
// Do the lease check up here. There might already be one out on this
// server, especially if it just shut down and came back up near-immediately.
if (!master.closed.get()) {
try {
serverLeases.createLease(s, new ServerExpirer(s));
} catch (Leases.LeaseStillHeldException e) {
LOG.debug("Lease still held on " + e.getName());
if (!checkForGhostReferences(serverInfo)) {
return;
}
}
// Go on to process the regionserver registration.
HServerLoad load = serversToLoad.remove(s);
if (load != null) {
// The startup message was from a known server.
@@ -119,7 +166,6 @@ class ServerManager implements HConstants {
}
}
}
HServerInfo storedInfo = serversToServerInfo.remove(s);
if (storedInfo != null && !master.closed.get()) {
// The startup message was from a known server with the same name.
@@ -137,7 +183,6 @@ class ServerManager implements HConstants {
LOG.error("Insertion into toDoQueue was interrupted", e);
}
}
// record new server
load = new HServerLoad();
serverInfo.setLoad(load);
@@ -703,7 +748,7 @@ class ServerManager implements HConstants {
}
}
}
deadServers.add(server);
deadServers.put(server, Boolean.FALSE);
try {
master.toDoQueue.put(
new ProcessServerShutdown(master, info, rootServer));
@@ -742,6 +787,23 @@ class ServerManager implements HConstants {
* @return true if server is dead
*/
public boolean isDead(String serverName) {
return deadServers.contains(serverName);
return deadServers.containsKey(serverName);
}
/**
* @param serverName
* @return True if this is a dead server and it has had its logs split.
*/
public boolean isDeadServerLogsSplit(final String serverName) {
Boolean b = this.deadServers.get(serverName);
return b == null? false: b.booleanValue();
}
/**
* Set that this dead server has had its logs split.
* @param serverName
*/
public void setDeadServerLogsSplit(final String serverName) {
this.deadServers.put(serverName, Boolean.TRUE);
}
}
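
Taken together, the dead-servers structure is now a small state machine: put(server, FALSE) on lease expiry, flip to TRUE once logs are split, remove once shutdown processing completes. A minimal standalone sketch (DeadServerRegistry is an illustrative name, not the real ServerManager):

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;

// Mirrors isDead / isDeadServerLogsSplit / setDeadServerLogsSplit /
// removeDeadServer from this patch.
final class DeadServerRegistry {
  private final ConcurrentMap<String, Boolean> dead =
      new ConcurrentHashMap<String, Boolean>();

  void markDead(String server)       { dead.put(server, Boolean.FALSE); }
  boolean isDead(String server)      { return dead.containsKey(server); }
  boolean isLogsSplit(String server) {
    Boolean b = dead.get(server);
    return b != null && b.booleanValue();
  }
  void setLogsSplit(String server)   { dead.put(server, Boolean.TRUE); }
  void remove(String server)         { dead.remove(server); }
}

A regionserver restarting on the same address spins in checkForGhostReferences until isDeadServerLogsSplit turns true, which is what keeps it from opening regions whose log edits are still being replayed.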