diff --git a/CHANGES.txt b/CHANGES.txt index f089c59856c..050c92ce1bb 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -131,6 +131,8 @@ Release 0.19.0 - Unreleased HBASE-1083 Will keep scheduling major compactions if last time one ran, we didn't. HBASE-1101 NPE in HConnectionManager$TableServers.processBatchOfRows + HBASE-1099 Regions assigned while master is splitting logs of recently + crashed server; regionserver tries to execute incomplete log IMPROVEMENTS HBASE-901 Add a limit to key length, check key and value length on client side diff --git a/src/java/org/apache/hadoop/hbase/Leases.java b/src/java/org/apache/hadoop/hbase/Leases.java index c490f3b1876..2696ad16d4c 100644 --- a/src/java/org/apache/hadoop/hbase/Leases.java +++ b/src/java/org/apache/hadoop/hbase/Leases.java @@ -55,7 +55,6 @@ public class Leases extends Thread { private final int leasePeriod; private final int leaseCheckFrequency; private volatile DelayQueue leaseQueue = new DelayQueue(); - protected final Map leases = new HashMap(); private volatile boolean stopRequested = false; @@ -88,15 +87,16 @@ public class Leases extends Thread { if (lease == null) { continue; } - // A lease expired + // A lease expired. Run the expired code before removing from queue + // since its presence in queue is used to see if lease exists still. + if (lease.getListener() == null) { + LOG.error("lease listener is null for lease " + lease.getLeaseName()); + } else { + lease.getListener().leaseExpired(); + } synchronized (leaseQueue) { leases.remove(lease.getLeaseName()); - if (lease.getListener() == null) { - LOG.error("lease listener is null for lease " + lease.getLeaseName()); - continue; - } } - lease.getListener().leaseExpired(); } close(); } diff --git a/src/java/org/apache/hadoop/hbase/master/BaseScanner.java b/src/java/org/apache/hadoop/hbase/master/BaseScanner.java index bcb375e78a8..b210824b39c 100644 --- a/src/java/org/apache/hadoop/hbase/master/BaseScanner.java +++ b/src/java/org/apache/hadoop/hbase/master/BaseScanner.java @@ -338,17 +338,16 @@ abstract class BaseScanner extends Chore implements HConstants { throws IOException { synchronized (regionManager) { - // Skip region - if ... - if(info.isOffline() // offline - || regionManager.isOfflined(info.getRegionName())) { // queued for offline - + // Skip region - if + if(info.isOffline() || + regionManager.isOfflined(info.getRegionName())) { // queued for offline regionManager.removeRegion(info); return; } HServerInfo storedInfo = null; + boolean deadServerAndLogsSplit = false; boolean deadServer = false; if (serverName.length() != 0) { - if (regionManager.isOfflined(info.getRegionName())) { // Skip if region is on kill list if(LOG.isDebugEnabled()) { @@ -357,31 +356,31 @@ abstract class BaseScanner extends Chore implements HConstants { } return; } - - storedInfo = master.serverManager.getServerInfo(serverName); - deadServer = master.serverManager.isDead(serverName); + storedInfo = this.master.serverManager.getServerInfo(serverName); + deadServer = this.master.serverManager.isDead(serverName); + deadServerAndLogsSplit = + this.master.serverManager.isDeadServerLogsSplit(serverName); } /* - * If the server is a dead server or its startcode is off -- either null + * If the server is a dead server and its logs have been split or its + * not on the dead server lists and its startcode is off -- either null * or doesn't match the start code for the address -- then add it to the * list of unassigned regions IF not already there (or pending open). */ - if ((deadServer || - (storedInfo == null || storedInfo.getStartCode() != startCode)) && - (!regionManager.isUnassigned(info) && - !regionManager.isPending(info.getRegionName()) && - !regionManager.isAssigned(info.getRegionName()))) { - + if ((deadServerAndLogsSplit || + (!deadServer && (storedInfo == null || + (storedInfo.getStartCode() != startCode)))) && + this.regionManager.assignable(info)) { // The current assignment is invalid - if (LOG.isDebugEnabled()) { - LOG.debug("Current assignment of " + - info.getRegionNameAsString() + - " is not valid." + - (storedInfo == null ? " Server '" + serverName + "' unknown." : + LOG.debug("Current assignment of " + info.getRegionNameAsString() + + " is not valid; deadServerAndLogsSplit=" + deadServerAndLogsSplit + + ", deadServer=" + deadServer + ". " + + (storedInfo == null ? " Server '" + serverName + "' unknown." : " serverInfo: " + storedInfo + ", passed startCode: " + - startCode + ", storedInfo.startCode: " + storedInfo.getStartCode()) + + startCode + ", storedInfo.startCode: " + + storedInfo.getStartCode()) + " Region is not unassigned, assigned or pending"); } @@ -389,7 +388,6 @@ abstract class BaseScanner extends Chore implements HConstants { // This is only done from here if we are restarting and there is stale // data in the meta region. Once we are on-line, dead server log // recovery is handled by lease expiration and ProcessServerShutdown - if (!regionManager.isInitialMetaScanComplete() && serverName.length() != 0) { StringBuilder dirName = new StringBuilder("log_"); @@ -418,7 +416,7 @@ abstract class BaseScanner extends Chore implements HConstants { } } } - + /** * Notify the thread to die at the end of its next run */ diff --git a/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java b/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java index 2cd1cb9881b..99b94704c3b 100644 --- a/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java +++ b/src/java/org/apache/hadoop/hbase/master/ChangeTableState.java @@ -91,9 +91,7 @@ class ChangeTableState extends TableOperation { synchronized (master.regionManager) { if (online) { // Bring offline regions on-line - if (!master.regionManager.isUnassigned(i) && - !master.regionManager.isAssigned(i.getRegionName()) && - !master.regionManager.isPending(i.getRegionName())) { + if (!master.regionManager.assignable(i)) { master.regionManager.setUnassigned(i, false); } } else { diff --git a/src/java/org/apache/hadoop/hbase/master/HMaster.java b/src/java/org/apache/hadoop/hbase/master/HMaster.java index ee8df84291a..0ba4b545298 100644 --- a/src/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/src/java/org/apache/hadoop/hbase/master/HMaster.java @@ -532,15 +532,14 @@ public class HMaster extends Thread implements HConstants, HMasterInterface, /* * HMasterRegionInterface */ - public MapWritable regionServerStartup(HServerInfo serverInfo) { - // Set the address for now even tho it will not be persisted on - // the HRS side. + public MapWritable regionServerStartup(final HServerInfo serverInfo) { + // Set the address for now even tho it will not be persisted on HRS side. String rsAddress = HBaseServer.getRemoteAddress(); - serverInfo.setServerAddress(new HServerAddress - (rsAddress, serverInfo.getServerAddress().getPort())); - // register with server manager - serverManager.regionServerStartup(serverInfo); - // send back some config info + serverInfo.setServerAddress(new HServerAddress(rsAddress, + serverInfo.getServerAddress().getPort())); + // Register with server manager + this.serverManager.regionServerStartup(serverInfo); + // Send back some config info return createConfigurationSubset(); } diff --git a/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java b/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java index c811840098f..23899c50fb3 100644 --- a/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java +++ b/src/java/org/apache/hadoop/hbase/master/ProcessServerShutdown.java @@ -45,11 +45,13 @@ import org.apache.hadoop.hbase.io.RowResult; */ class ProcessServerShutdown extends RegionServerOperation { private final HServerAddress deadServer; - private final String deadServerName; + /* + * Cache of the server name. + */ + private final String deadServerStr; private final boolean rootRegionServer; private boolean rootRegionReassigned = false; private Path oldLogDir; - private boolean logSplit; private boolean rootRescanned; @@ -74,9 +76,8 @@ class ProcessServerShutdown extends RegionServerOperation { boolean rootRegionServer) { super(master); this.deadServer = serverInfo.getServerAddress(); - this.deadServerName = this.deadServer.toString(); + this.deadServerStr = this.deadServer.toString(); this.rootRegionServer = rootRegionServer; - this.logSplit = false; this.rootRescanned = false; this.oldLogDir = new Path(master.rootdir, HLog.getHLogDirectoryName(serverInfo)); @@ -84,13 +85,14 @@ class ProcessServerShutdown extends RegionServerOperation { @Override public String toString() { - return "ProcessServerShutdown of " + this.deadServer.toString(); + return "ProcessServerShutdown of " + this.deadServerStr; } - /** Finds regions that the dead region server was serving */ + /** Finds regions that the dead region server was serving + */ protected void scanMetaRegion(HRegionInterface server, long scannerId, - byte [] regionName) throws IOException { - + byte [] regionName) + throws IOException { List toDoList = new ArrayList(); Set regions = new HashSet(); List emptyRows = new ArrayList(); @@ -107,14 +109,13 @@ class ProcessServerShutdown extends RegionServerOperation { if (values == null || values.size() == 0) { break; } - byte [] row = values.getRow(); - // Check server name. If null, be conservative and treat as though - // region had been on shutdown server (could be null because we - // missed edits in hlog because hdfs does not do write-append). + // Check server name. If null, skip (We used to consider it was on + // shutdown server but that would mean that we'd reassign regions that + // were already out being assigned, ones that were product of a split + // that happened while the shutdown was being processed. String serverName = Writables.cellToString(values.get(COL_SERVER)); - if (serverName != null && serverName.length() > 0 && - deadServerName.compareTo(serverName) != 0) { + if (serverName == null || !deadServerStr.equals(serverName)) { // This isn't the server you're looking for - move along continue; } @@ -159,7 +160,7 @@ class ProcessServerShutdown extends RegionServerOperation { } } } finally { - if(scannerId != -1L) { + if (scannerId != -1L) { try { server.close(scannerId); } catch (IOException e) { @@ -222,21 +223,22 @@ class ProcessServerShutdown extends RegionServerOperation { long scannerId = server.openScanner(m.getRegionName(), COLUMN_FAMILY_ARRAY, EMPTY_START_ROW, HConstants.LATEST_TIMESTAMP, null); - - scanMetaRegion(server, scannerId, m.getRegionName()); + scanMetaRegion(server, scannerId, m.getRegionName()); return true; } } @Override protected boolean process() throws IOException { - LOG.info("process shutdown of server " + deadServer + ": logSplit: " + - this.logSplit + ", rootRescanned: " + rootRescanned + + boolean logSplit = + this.master.serverManager.isDeadServerLogsSplit(this.deadServerStr); + LOG.info("process shutdown of server " + this.deadServerStr + + ": logSplit: " + + logSplit + ", rootRescanned: " + rootRescanned + ", numberOfMetaRegions: " + master.regionManager.numMetaRegions() + ", onlineMetaRegions.size(): " + master.regionManager.numOnlineMetaRegions()); - if (!logSplit) { // Process the old log file if (master.fs.exists(oldLogDir)) { @@ -250,9 +252,9 @@ class ProcessServerShutdown extends RegionServerOperation { master.regionManager.splitLogLock.unlock(); } } - logSplit = true; + this.master.serverManager.setDeadServerLogsSplit(this.deadServerStr); } - + if (this.rootRegionServer && !this.rootRegionReassigned) { // avoid multiple root region reassignment this.rootRegionReassigned = true; @@ -277,7 +279,6 @@ class ProcessServerShutdown extends RegionServerOperation { new MetaRegion(master.getRootRegionLocation(), HRegionInfo.ROOT_REGIONINFO.getRegionName(), HConstants.EMPTY_START_ROW), this.master).doWithRetries(); - if (result == null) { // Master is closing - give up return true; @@ -290,7 +291,6 @@ class ProcessServerShutdown extends RegionServerOperation { } rootRescanned = true; } - if (!metaTableAvailable()) { // We can't proceed because not all meta regions are online. // metaAvailable() has put this request on the delayedToDoQueue @@ -309,7 +309,11 @@ class ProcessServerShutdown extends RegionServerOperation { Bytes.toString(r.getRegionName()) + " on " + r.getServer()); } } - master.serverManager.removeDeadServer(deadServerName); + // Remove this server from dead servers list. Finished splitting logs. + this.master.serverManager.removeDeadServer(deadServerStr); + if (LOG.isDebugEnabled()) { + LOG.debug("Removed " + deadServerStr + " from deadservers Map"); + } return true; } -} +} \ No newline at end of file diff --git a/src/java/org/apache/hadoop/hbase/master/RegionManager.java b/src/java/org/apache/hadoop/hbase/master/RegionManager.java index ed7006116fd..c3c76a240d8 100644 --- a/src/java/org/apache/hadoop/hbase/master/RegionManager.java +++ b/src/java/org/apache/hadoop/hbase/master/RegionManager.java @@ -694,6 +694,17 @@ class RegionManager implements HConstants { } return false; } + + /** + * @param hri + * @return True if the passed region is assignable: i.e. not assigned, not + * pending and not unassigned. + */ + public boolean assignable(final HRegionInfo hri) { + return !isUnassigned(hri) && + !isPending(hri.getRegionName()) && + !isAssigned(hri.getRegionName()); + } /** * @param regionName diff --git a/src/java/org/apache/hadoop/hbase/master/ServerManager.java b/src/java/org/apache/hadoop/hbase/master/ServerManager.java index 5aab717c39a..c161f95cdaf 100644 --- a/src/java/org/apache/hadoop/hbase/master/ServerManager.java +++ b/src/java/org/apache/hadoop/hbase/master/ServerManager.java @@ -59,9 +59,13 @@ class ServerManager implements HConstants { final Map serversToServerInfo = new ConcurrentHashMap(); - /** Set of known dead servers */ - final Set deadServers = - Collections.synchronizedSet(new HashSet()); + /** + * Set of known dead servers. On lease expiration, servers are added here. + * Boolean holds whether its logs have been split or not. Initially set to + * false. + */ + private final Map deadServers = + new ConcurrentHashMap(); /** SortedMap server load -> Set of server names */ final SortedMap> loadToServers = @@ -89,24 +93,67 @@ class ServerManager implements HConstants { this.loggingPeriodForAverageLoad = master.getConfiguration(). getLong("hbase.master.avgload.logging.period", 60000); } - + + /* + * Look to see if we have ghost references to this regionserver such as + * still-existing leases or if regionserver is on the dead servers list + * getting its logs processed. + * @param serverInfo + * @return True if still ghost references and we have not been able to clear + * them or the server is shutting down. + */ + private boolean checkForGhostReferences(final HServerInfo serverInfo) { + String s = serverInfo.getServerAddress().toString().trim(); + boolean result = false; + boolean lease = false; + for (long sleepTime = -1; !master.closed.get() && !result;) { + if (sleepTime != -1) { + try { + Thread.sleep(sleepTime); + } catch (InterruptedException e) { + // Continue + } + } + if (!lease) { + try { + this.serverLeases.createLease(s, new ServerExpirer(s)); + } catch (Leases.LeaseStillHeldException e) { + LOG.debug("Waiting on current lease to expire for " + e.getName()); + sleepTime = this.master.leaseTimeout / 4; + continue; + } + lease = true; + } + // May be on list of dead servers. If so, wait till we've cleared it. + String addr = serverInfo.getServerAddress().toString(); + if (isDead(addr) && !isDeadServerLogsSplit(addr)) { + LOG.debug("Waiting on " + addr + " removal from dead list before " + + "processing report-for-duty request"); + sleepTime = this.master.threadWakeFrequency; + try { + // Keep up lease. May be here > lease expiration. + this.serverLeases.renewLease(s); + } catch (LeaseException e) { + LOG.warn("Failed renewal. Retrying.", e); + } + continue; + } + result = true; + } + return result; + } + /** * Let the server manager know a new regionserver has come online * @param serverInfo */ - public void regionServerStartup(HServerInfo serverInfo) { + public void regionServerStartup(final HServerInfo serverInfo) { String s = serverInfo.getServerAddress().toString().trim(); LOG.info("Received start message from: " + s); - // Do the lease check up here. There might already be one out on this - // server expecially if it just shutdown and came back up near-immediately. - if (!master.closed.get()) { - try { - serverLeases.createLease(s, new ServerExpirer(s)); - } catch (Leases.LeaseStillHeldException e) { - LOG.debug("Lease still held on " + e.getName()); - return; - } + if (!checkForGhostReferences(serverInfo)) { + return; } + // Go on to process the regionserver registration. HServerLoad load = serversToLoad.remove(s); if (load != null) { // The startup message was from a known server. @@ -119,7 +166,6 @@ class ServerManager implements HConstants { } } } - HServerInfo storedInfo = serversToServerInfo.remove(s); if (storedInfo != null && !master.closed.get()) { // The startup message was from a known server with the same name. @@ -137,7 +183,6 @@ class ServerManager implements HConstants { LOG.error("Insertion into toDoQueue was interrupted", e); } } - // record new server load = new HServerLoad(); serverInfo.setLoad(load); @@ -703,7 +748,7 @@ class ServerManager implements HConstants { } } } - deadServers.add(server); + deadServers.put(server, Boolean.FALSE); try { master.toDoQueue.put( new ProcessServerShutdown(master, info, rootServer)); @@ -742,6 +787,23 @@ class ServerManager implements HConstants { * @return true if server is dead */ public boolean isDead(String serverName) { - return deadServers.contains(serverName); + return deadServers.containsKey(serverName); } -} \ No newline at end of file + + /** + * @param serverName + * @return True if this is a dead server and it has had its logs split. + */ + public boolean isDeadServerLogsSplit(final String serverName) { + Boolean b = this.deadServers.get(serverName); + return b == null? false: b.booleanValue(); + } + + /** + * Set that this deadserver has had its log split. + * @param serverName + */ + public void setDeadServerLogsSplit(final String serverName) { + this.deadServers.put(serverName, Boolean.TRUE); + } +}