HBASE-1099 Regions assigned while master is splitting logs of recently crashed server; regionserver tries to execute incomplete log
git-svn-id: https://svn.apache.org/repos/asf/hadoop/hbase/trunk@732491 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
18815c8879
commit
079c580042
|
@ -131,6 +131,8 @@ Release 0.19.0 - Unreleased
|
|||
HBASE-1083 Will keep scheduling major compactions if last time one ran, we
|
||||
didn't.
|
||||
HBASE-1101 NPE in HConnectionManager$TableServers.processBatchOfRows
|
||||
HBASE-1099 Regions assigned while master is splitting logs of recently
|
||||
crashed server; regionserver tries to execute incomplete log
|
||||
|
||||
IMPROVEMENTS
|
||||
HBASE-901 Add a limit to key length, check key and value length on client side
|
||||
|
|
|
@ -55,7 +55,6 @@ public class Leases extends Thread {
|
|||
private final int leasePeriod;
|
||||
private final int leaseCheckFrequency;
|
||||
private volatile DelayQueue<Lease> leaseQueue = new DelayQueue<Lease>();
|
||||
|
||||
protected final Map<String, Lease> leases = new HashMap<String, Lease>();
|
||||
private volatile boolean stopRequested = false;
|
||||
|
||||
|
@ -88,16 +87,17 @@ public class Leases extends Thread {
|
|||
if (lease == null) {
|
||||
continue;
|
||||
}
|
||||
// A lease expired
|
||||
synchronized (leaseQueue) {
|
||||
leases.remove(lease.getLeaseName());
|
||||
// A lease expired. Run the expired code before removing from queue
|
||||
// since its presence in queue is used to see if lease exists still.
|
||||
if (lease.getListener() == null) {
|
||||
LOG.error("lease listener is null for lease " + lease.getLeaseName());
|
||||
continue;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
lease.getListener().leaseExpired();
|
||||
}
|
||||
synchronized (leaseQueue) {
|
||||
leases.remove(lease.getLeaseName());
|
||||
}
|
||||
}
|
||||
close();
|
||||
}
|
||||
|
||||
|
|
|
@ -338,17 +338,16 @@ abstract class BaseScanner extends Chore implements HConstants {
|
|||
throws IOException {
|
||||
|
||||
synchronized (regionManager) {
|
||||
// Skip region - if ...
|
||||
if(info.isOffline() // offline
|
||||
|| regionManager.isOfflined(info.getRegionName())) { // queued for offline
|
||||
|
||||
// Skip region - if
|
||||
if(info.isOffline() ||
|
||||
regionManager.isOfflined(info.getRegionName())) { // queued for offline
|
||||
regionManager.removeRegion(info);
|
||||
return;
|
||||
}
|
||||
HServerInfo storedInfo = null;
|
||||
boolean deadServerAndLogsSplit = false;
|
||||
boolean deadServer = false;
|
||||
if (serverName.length() != 0) {
|
||||
|
||||
if (regionManager.isOfflined(info.getRegionName())) {
|
||||
// Skip if region is on kill list
|
||||
if(LOG.isDebugEnabled()) {
|
||||
|
@ -357,31 +356,31 @@ abstract class BaseScanner extends Chore implements HConstants {
|
|||
}
|
||||
return;
|
||||
}
|
||||
|
||||
storedInfo = master.serverManager.getServerInfo(serverName);
|
||||
deadServer = master.serverManager.isDead(serverName);
|
||||
storedInfo = this.master.serverManager.getServerInfo(serverName);
|
||||
deadServer = this.master.serverManager.isDead(serverName);
|
||||
deadServerAndLogsSplit =
|
||||
this.master.serverManager.isDeadServerLogsSplit(serverName);
|
||||
}
|
||||
|
||||
/*
|
||||
* If the server is a dead server or its startcode is off -- either null
|
||||
* If the server is a dead server and its logs have been split or its
|
||||
* not on the dead server lists and its startcode is off -- either null
|
||||
* or doesn't match the start code for the address -- then add it to the
|
||||
* list of unassigned regions IF not already there (or pending open).
|
||||
*/
|
||||
if ((deadServer ||
|
||||
(storedInfo == null || storedInfo.getStartCode() != startCode)) &&
|
||||
(!regionManager.isUnassigned(info) &&
|
||||
!regionManager.isPending(info.getRegionName()) &&
|
||||
!regionManager.isAssigned(info.getRegionName()))) {
|
||||
|
||||
if ((deadServerAndLogsSplit ||
|
||||
(!deadServer && (storedInfo == null ||
|
||||
(storedInfo.getStartCode() != startCode)))) &&
|
||||
this.regionManager.assignable(info)) {
|
||||
// The current assignment is invalid
|
||||
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Current assignment of " +
|
||||
info.getRegionNameAsString() +
|
||||
" is not valid." +
|
||||
LOG.debug("Current assignment of " + info.getRegionNameAsString() +
|
||||
" is not valid; deadServerAndLogsSplit=" + deadServerAndLogsSplit +
|
||||
", deadServer=" + deadServer + ". " +
|
||||
(storedInfo == null ? " Server '" + serverName + "' unknown." :
|
||||
" serverInfo: " + storedInfo + ", passed startCode: " +
|
||||
startCode + ", storedInfo.startCode: " + storedInfo.getStartCode()) +
|
||||
startCode + ", storedInfo.startCode: " +
|
||||
storedInfo.getStartCode()) +
|
||||
" Region is not unassigned, assigned or pending");
|
||||
}
|
||||
|
||||
|
@ -389,7 +388,6 @@ abstract class BaseScanner extends Chore implements HConstants {
|
|||
// This is only done from here if we are restarting and there is stale
|
||||
// data in the meta region. Once we are on-line, dead server log
|
||||
// recovery is handled by lease expiration and ProcessServerShutdown
|
||||
|
||||
if (!regionManager.isInitialMetaScanComplete() &&
|
||||
serverName.length() != 0) {
|
||||
StringBuilder dirName = new StringBuilder("log_");
|
||||
|
|
|
@ -91,9 +91,7 @@ class ChangeTableState extends TableOperation {
|
|||
synchronized (master.regionManager) {
|
||||
if (online) {
|
||||
// Bring offline regions on-line
|
||||
if (!master.regionManager.isUnassigned(i) &&
|
||||
!master.regionManager.isAssigned(i.getRegionName()) &&
|
||||
!master.regionManager.isPending(i.getRegionName())) {
|
||||
if (!master.regionManager.assignable(i)) {
|
||||
master.regionManager.setUnassigned(i, false);
|
||||
}
|
||||
} else {
|
||||
|
|
|
@ -532,15 +532,14 @@ public class HMaster extends Thread implements HConstants, HMasterInterface,
|
|||
/*
|
||||
* HMasterRegionInterface
|
||||
*/
|
||||
public MapWritable regionServerStartup(HServerInfo serverInfo) {
|
||||
// Set the address for now even tho it will not be persisted on
|
||||
// the HRS side.
|
||||
public MapWritable regionServerStartup(final HServerInfo serverInfo) {
|
||||
// Set the address for now even tho it will not be persisted on HRS side.
|
||||
String rsAddress = HBaseServer.getRemoteAddress();
|
||||
serverInfo.setServerAddress(new HServerAddress
|
||||
(rsAddress, serverInfo.getServerAddress().getPort()));
|
||||
// register with server manager
|
||||
serverManager.regionServerStartup(serverInfo);
|
||||
// send back some config info
|
||||
serverInfo.setServerAddress(new HServerAddress(rsAddress,
|
||||
serverInfo.getServerAddress().getPort()));
|
||||
// Register with server manager
|
||||
this.serverManager.regionServerStartup(serverInfo);
|
||||
// Send back some config info
|
||||
return createConfigurationSubset();
|
||||
}
|
||||
|
||||
|
|
|
@ -45,11 +45,13 @@ import org.apache.hadoop.hbase.io.RowResult;
|
|||
*/
|
||||
class ProcessServerShutdown extends RegionServerOperation {
|
||||
private final HServerAddress deadServer;
|
||||
private final String deadServerName;
|
||||
/*
|
||||
* Cache of the server name.
|
||||
*/
|
||||
private final String deadServerStr;
|
||||
private final boolean rootRegionServer;
|
||||
private boolean rootRegionReassigned = false;
|
||||
private Path oldLogDir;
|
||||
private boolean logSplit;
|
||||
private boolean rootRescanned;
|
||||
|
||||
|
||||
|
@ -74,9 +76,8 @@ class ProcessServerShutdown extends RegionServerOperation {
|
|||
boolean rootRegionServer) {
|
||||
super(master);
|
||||
this.deadServer = serverInfo.getServerAddress();
|
||||
this.deadServerName = this.deadServer.toString();
|
||||
this.deadServerStr = this.deadServer.toString();
|
||||
this.rootRegionServer = rootRegionServer;
|
||||
this.logSplit = false;
|
||||
this.rootRescanned = false;
|
||||
this.oldLogDir =
|
||||
new Path(master.rootdir, HLog.getHLogDirectoryName(serverInfo));
|
||||
|
@ -84,13 +85,14 @@ class ProcessServerShutdown extends RegionServerOperation {
|
|||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "ProcessServerShutdown of " + this.deadServer.toString();
|
||||
return "ProcessServerShutdown of " + this.deadServerStr;
|
||||
}
|
||||
|
||||
/** Finds regions that the dead region server was serving */
|
||||
/** Finds regions that the dead region server was serving
|
||||
*/
|
||||
protected void scanMetaRegion(HRegionInterface server, long scannerId,
|
||||
byte [] regionName) throws IOException {
|
||||
|
||||
byte [] regionName)
|
||||
throws IOException {
|
||||
List<ToDoEntry> toDoList = new ArrayList<ToDoEntry>();
|
||||
Set<HRegionInfo> regions = new HashSet<HRegionInfo>();
|
||||
List<byte []> emptyRows = new ArrayList<byte []>();
|
||||
|
@ -107,14 +109,13 @@ class ProcessServerShutdown extends RegionServerOperation {
|
|||
if (values == null || values.size() == 0) {
|
||||
break;
|
||||
}
|
||||
|
||||
byte [] row = values.getRow();
|
||||
// Check server name. If null, be conservative and treat as though
|
||||
// region had been on shutdown server (could be null because we
|
||||
// missed edits in hlog because hdfs does not do write-append).
|
||||
// Check server name. If null, skip (We used to consider it was on
|
||||
// shutdown server but that would mean that we'd reassign regions that
|
||||
// were already out being assigned, ones that were product of a split
|
||||
// that happened while the shutdown was being processed.
|
||||
String serverName = Writables.cellToString(values.get(COL_SERVER));
|
||||
if (serverName != null && serverName.length() > 0 &&
|
||||
deadServerName.compareTo(serverName) != 0) {
|
||||
if (serverName == null || !deadServerStr.equals(serverName)) {
|
||||
// This isn't the server you're looking for - move along
|
||||
continue;
|
||||
}
|
||||
|
@ -222,7 +223,6 @@ class ProcessServerShutdown extends RegionServerOperation {
|
|||
long scannerId =
|
||||
server.openScanner(m.getRegionName(), COLUMN_FAMILY_ARRAY,
|
||||
EMPTY_START_ROW, HConstants.LATEST_TIMESTAMP, null);
|
||||
|
||||
scanMetaRegion(server, scannerId, m.getRegionName());
|
||||
return true;
|
||||
}
|
||||
|
@ -230,13 +230,15 @@ class ProcessServerShutdown extends RegionServerOperation {
|
|||
|
||||
@Override
|
||||
protected boolean process() throws IOException {
|
||||
LOG.info("process shutdown of server " + deadServer + ": logSplit: " +
|
||||
this.logSplit + ", rootRescanned: " + rootRescanned +
|
||||
boolean logSplit =
|
||||
this.master.serverManager.isDeadServerLogsSplit(this.deadServerStr);
|
||||
LOG.info("process shutdown of server " + this.deadServerStr +
|
||||
": logSplit: " +
|
||||
logSplit + ", rootRescanned: " + rootRescanned +
|
||||
", numberOfMetaRegions: " +
|
||||
master.regionManager.numMetaRegions() +
|
||||
", onlineMetaRegions.size(): " +
|
||||
master.regionManager.numOnlineMetaRegions());
|
||||
|
||||
if (!logSplit) {
|
||||
// Process the old log file
|
||||
if (master.fs.exists(oldLogDir)) {
|
||||
|
@ -250,7 +252,7 @@ class ProcessServerShutdown extends RegionServerOperation {
|
|||
master.regionManager.splitLogLock.unlock();
|
||||
}
|
||||
}
|
||||
logSplit = true;
|
||||
this.master.serverManager.setDeadServerLogsSplit(this.deadServerStr);
|
||||
}
|
||||
|
||||
if (this.rootRegionServer && !this.rootRegionReassigned) {
|
||||
|
@ -277,7 +279,6 @@ class ProcessServerShutdown extends RegionServerOperation {
|
|||
new MetaRegion(master.getRootRegionLocation(),
|
||||
HRegionInfo.ROOT_REGIONINFO.getRegionName(),
|
||||
HConstants.EMPTY_START_ROW), this.master).doWithRetries();
|
||||
|
||||
if (result == null) {
|
||||
// Master is closing - give up
|
||||
return true;
|
||||
|
@ -290,7 +291,6 @@ class ProcessServerShutdown extends RegionServerOperation {
|
|||
}
|
||||
rootRescanned = true;
|
||||
}
|
||||
|
||||
if (!metaTableAvailable()) {
|
||||
// We can't proceed because not all meta regions are online.
|
||||
// metaAvailable() has put this request on the delayedToDoQueue
|
||||
|
@ -309,7 +309,11 @@ class ProcessServerShutdown extends RegionServerOperation {
|
|||
Bytes.toString(r.getRegionName()) + " on " + r.getServer());
|
||||
}
|
||||
}
|
||||
master.serverManager.removeDeadServer(deadServerName);
|
||||
// Remove this server from dead servers list. Finished splitting logs.
|
||||
this.master.serverManager.removeDeadServer(deadServerStr);
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Removed " + deadServerStr + " from deadservers Map");
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
|
@ -695,6 +695,17 @@ class RegionManager implements HConstants {
|
|||
return false;
|
||||
}
|
||||
|
||||
/**
|
||||
* @param hri
|
||||
* @return True if the passed region is assignable: i.e. not assigned, not
|
||||
* pending and not unassigned.
|
||||
*/
|
||||
public boolean assignable(final HRegionInfo hri) {
|
||||
return !isUnassigned(hri) &&
|
||||
!isPending(hri.getRegionName()) &&
|
||||
!isAssigned(hri.getRegionName());
|
||||
}
|
||||
|
||||
/**
|
||||
* @param regionName
|
||||
* @return true if region has been assigned
|
||||
|
|
|
@ -59,9 +59,13 @@ class ServerManager implements HConstants {
|
|||
final Map<String, HServerInfo> serversToServerInfo =
|
||||
new ConcurrentHashMap<String, HServerInfo>();
|
||||
|
||||
/** Set of known dead servers */
|
||||
final Set<String> deadServers =
|
||||
Collections.synchronizedSet(new HashSet<String>());
|
||||
/**
|
||||
* Set of known dead servers. On lease expiration, servers are added here.
|
||||
* Boolean holds whether its logs have been split or not. Initially set to
|
||||
* false.
|
||||
*/
|
||||
private final Map<String, Boolean> deadServers =
|
||||
new ConcurrentHashMap<String, Boolean>();
|
||||
|
||||
/** SortedMap server load -> Set of server names */
|
||||
final SortedMap<HServerLoad, Set<String>> loadToServers =
|
||||
|
@ -90,23 +94,66 @@ class ServerManager implements HConstants {
|
|||
getLong("hbase.master.avgload.logging.period", 60000);
|
||||
}
|
||||
|
||||
/*
|
||||
* Look to see if we have ghost references to this regionserver such as
|
||||
* still-existing leases or if regionserver is on the dead servers list
|
||||
* getting its logs processed.
|
||||
* @param serverInfo
|
||||
* @return True if still ghost references and we have not been able to clear
|
||||
* them or the server is shutting down.
|
||||
*/
|
||||
private boolean checkForGhostReferences(final HServerInfo serverInfo) {
|
||||
String s = serverInfo.getServerAddress().toString().trim();
|
||||
boolean result = false;
|
||||
boolean lease = false;
|
||||
for (long sleepTime = -1; !master.closed.get() && !result;) {
|
||||
if (sleepTime != -1) {
|
||||
try {
|
||||
Thread.sleep(sleepTime);
|
||||
} catch (InterruptedException e) {
|
||||
// Continue
|
||||
}
|
||||
}
|
||||
if (!lease) {
|
||||
try {
|
||||
this.serverLeases.createLease(s, new ServerExpirer(s));
|
||||
} catch (Leases.LeaseStillHeldException e) {
|
||||
LOG.debug("Waiting on current lease to expire for " + e.getName());
|
||||
sleepTime = this.master.leaseTimeout / 4;
|
||||
continue;
|
||||
}
|
||||
lease = true;
|
||||
}
|
||||
// May be on list of dead servers. If so, wait till we've cleared it.
|
||||
String addr = serverInfo.getServerAddress().toString();
|
||||
if (isDead(addr) && !isDeadServerLogsSplit(addr)) {
|
||||
LOG.debug("Waiting on " + addr + " removal from dead list before " +
|
||||
"processing report-for-duty request");
|
||||
sleepTime = this.master.threadWakeFrequency;
|
||||
try {
|
||||
// Keep up lease. May be here > lease expiration.
|
||||
this.serverLeases.renewLease(s);
|
||||
} catch (LeaseException e) {
|
||||
LOG.warn("Failed renewal. Retrying.", e);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
result = true;
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Let the server manager know a new regionserver has come online
|
||||
* @param serverInfo
|
||||
*/
|
||||
public void regionServerStartup(HServerInfo serverInfo) {
|
||||
public void regionServerStartup(final HServerInfo serverInfo) {
|
||||
String s = serverInfo.getServerAddress().toString().trim();
|
||||
LOG.info("Received start message from: " + s);
|
||||
// Do the lease check up here. There might already be one out on this
|
||||
// server expecially if it just shutdown and came back up near-immediately.
|
||||
if (!master.closed.get()) {
|
||||
try {
|
||||
serverLeases.createLease(s, new ServerExpirer(s));
|
||||
} catch (Leases.LeaseStillHeldException e) {
|
||||
LOG.debug("Lease still held on " + e.getName());
|
||||
if (!checkForGhostReferences(serverInfo)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
// Go on to process the regionserver registration.
|
||||
HServerLoad load = serversToLoad.remove(s);
|
||||
if (load != null) {
|
||||
// The startup message was from a known server.
|
||||
|
@ -119,7 +166,6 @@ class ServerManager implements HConstants {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
HServerInfo storedInfo = serversToServerInfo.remove(s);
|
||||
if (storedInfo != null && !master.closed.get()) {
|
||||
// The startup message was from a known server with the same name.
|
||||
|
@ -137,7 +183,6 @@ class ServerManager implements HConstants {
|
|||
LOG.error("Insertion into toDoQueue was interrupted", e);
|
||||
}
|
||||
}
|
||||
|
||||
// record new server
|
||||
load = new HServerLoad();
|
||||
serverInfo.setLoad(load);
|
||||
|
@ -703,7 +748,7 @@ class ServerManager implements HConstants {
|
|||
}
|
||||
}
|
||||
}
|
||||
deadServers.add(server);
|
||||
deadServers.put(server, Boolean.FALSE);
|
||||
try {
|
||||
master.toDoQueue.put(
|
||||
new ProcessServerShutdown(master, info, rootServer));
|
||||
|
@ -742,6 +787,23 @@ class ServerManager implements HConstants {
|
|||
* @return true if server is dead
|
||||
*/
|
||||
public boolean isDead(String serverName) {
|
||||
return deadServers.contains(serverName);
|
||||
return deadServers.containsKey(serverName);
|
||||
}
|
||||
|
||||
/**
|
||||
* @param serverName
|
||||
* @return True if this is a dead server and it has had its logs split.
|
||||
*/
|
||||
public boolean isDeadServerLogsSplit(final String serverName) {
|
||||
Boolean b = this.deadServers.get(serverName);
|
||||
return b == null? false: b.booleanValue();
|
||||
}
|
||||
|
||||
/**
|
||||
* Set that this deadserver has had its log split.
|
||||
* @param serverName
|
||||
*/
|
||||
public void setDeadServerLogsSplit(final String serverName) {
|
||||
this.deadServers.put(serverName, Boolean.TRUE);
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue