From 6e4efdf6970d9e3aef1a1517d92169e9f916e8b6 Mon Sep 17 00:00:00 2001 From: Jim Kellerman Date: Fri, 5 Oct 2007 09:39:32 +0000 Subject: [PATCH] HADOOP-1937 When the master times out a region server's lease, it is too aggressive in reclaiming the server's log. git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk/src/contrib/hbase@582165 13f79535-47bb-0310-9956-ffa450edef68 --- CHANGES.txt | 2 + src/java/org/apache/hadoop/hbase/HMaster.java | 76 +++++++++++-------- .../apache/hadoop/hbase/HRegionServer.java | 55 +++++++++----- 3 files changed, 81 insertions(+), 52 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index b07117b73a2..75dbac4954a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -66,6 +66,8 @@ Trunk (unreleased changes) HADOOP-1975 HBase tests failing with java.lang.NumberFormatException HADOOP-1990 Regression test instability affects nightly and patch builds HADOOP-1996 TestHStoreFile fails on windows if run multiple times + HADOOP-1937 When the master times out a region server's lease, it is too + aggressive in reclaiming the server's log. IMPROVEMENTS HADOOP-1737 Make HColumnDescriptor data publically members settable diff --git a/src/java/org/apache/hadoop/hbase/HMaster.java b/src/java/org/apache/hadoop/hbase/HMaster.java index 68e734eec61..c9b9a1b07d3 100644 --- a/src/java/org/apache/hadoop/hbase/HMaster.java +++ b/src/java/org/apache/hadoop/hbase/HMaster.java @@ -35,6 +35,8 @@ import java.util.TimerTask; import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.BlockingQueue; +import java.util.concurrent.Delayed; +import java.util.concurrent.DelayQueue; import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; @@ -99,8 +101,10 @@ HMasterRegionInterface { int numRetries; long maxRegionOpenTime; + DelayQueue shutdownQueue; BlockingQueue msgQueue; + int leaseTimeout; private Leases serverLeases; private Server server; private HServerAddress address; @@ -860,11 +864,12 @@ HMasterRegionInterface { this.numRetries = conf.getInt("hbase.client.retries.number", 2); this.maxRegionOpenTime = conf.getLong("hbase.hbasemaster.maxregionopen", 30 * 1000); - + + this.shutdownQueue = new DelayQueue(); this.msgQueue = new LinkedBlockingQueue(); - - this.serverLeases = new Leases( - conf.getInt("hbase.master.lease.period", 30 * 1000), + + this.leaseTimeout = conf.getInt("hbase.master.lease.period", 30 * 1000); + this.serverLeases = new Leases(this.leaseTimeout, conf.getInt("hbase.master.lease.thread.wakefrequency", 15 * 1000)); this.server = RPC.getServer(this, address.getBindAddress(), @@ -966,10 +971,13 @@ HMasterRegionInterface { */ try { for (PendingOperation op = null; !closed.get(); ) { - try { - op = msgQueue.poll(threadWakeFrequency, TimeUnit.MILLISECONDS); - } catch (InterruptedException e) { - // continue + op = shutdownQueue.poll(); + if (op == null ) { + try { + op = msgQueue.poll(threadWakeFrequency, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // continue + } } if (op == null || closed.get()) { continue; @@ -1117,6 +1125,7 @@ HMasterRegionInterface { * HMasterRegionInterface */ + /** {@inheritDoc} */ @SuppressWarnings("unused") public MapWritable regionServerStartup(HServerInfo serverInfo) throws IOException { @@ -1140,11 +1149,7 @@ HMasterRegionInterface { serversToServerInfo.notifyAll(); } if (storedInfo != null && !closed.get()) { - try { - msgQueue.put(new PendingServerShutdown(storedInfo)); - } catch (InterruptedException e) { - throw new RuntimeException("Putting into msgQueue was interrupted.", e); - } + shutdownQueue.put(new PendingServerShutdown(storedInfo)); } // Either way, record the new server @@ -1683,9 +1688,12 @@ HMasterRegionInterface { * The region server's log file needs to be split up for each region it was * serving, and the regions need to get reassigned. */ - private class PendingServerShutdown extends PendingOperation { + private class PendingServerShutdown extends PendingOperation + implements Delayed { + private long delay; private HServerAddress deadServer; private String deadServerName; + private Path oldLogDir; private transient boolean logSplit; private transient boolean rootChecked; private transient boolean rootRescanned; @@ -1706,13 +1714,32 @@ HMasterRegionInterface { PendingServerShutdown(HServerInfo serverInfo) { super(); + this.delay = leaseTimeout / 2; this.deadServer = serverInfo.getServerAddress(); this.deadServerName = this.deadServer.toString(); this.logSplit = false; this.rootChecked = false; this.rootRescanned = false; + StringBuilder dirName = new StringBuilder("log_"); + dirName.append(deadServer.getBindAddress()); + dirName.append("_"); + dirName.append(serverInfo.getStartCode()); + dirName.append("_"); + dirName.append(deadServer.getPort()); + this.oldLogDir = new Path(dir, dirName.toString()); } + /** {@inheritDoc} */ + public long getDelay(TimeUnit unit) { + return unit.convert(delay, TimeUnit.MILLISECONDS); + } + + /** {@inheritDoc} */ + public int compareTo(Delayed o) { + return Long.valueOf(getDelay(TimeUnit.MILLISECONDS) + - o.getDelay(TimeUnit.MILLISECONDS)).intValue(); + } + /** {@inheritDoc} */ @Override public String toString() { @@ -1875,17 +1902,12 @@ HMasterRegionInterface { if (!logSplit) { // Process the old log file - StringBuilder dirName = new StringBuilder("log_"); - dirName.append(deadServer.getBindAddress()); - dirName.append("_"); - dirName.append(deadServer.getPort()); - Path logdir = new Path(dir, dirName.toString()); - if (fs.exists(logdir)) { + if (fs.exists(oldLogDir)) { if (!splitLogLock.tryLock()) { return false; } try { - HLog.splitLog(dir, logdir, fs, conf); + HLog.splitLog(dir, oldLogDir, fs, conf); } finally { splitLogLock.unlock(); } @@ -2901,16 +2923,8 @@ HMasterRegionInterface { // NOTE: If the server was serving the root region, we cannot reassign it // here because the new server will start serving the root region before // the PendingServerShutdown operation has a chance to split the log file. - try { - if (info != null) { - msgQueue.put(new PendingServerShutdown(info)); - } - } catch (InterruptedException e) { - // continue. We used to throw a RuntimeException here but on exit - // this put is often interrupted. For now, just log these iterrupts - // rather than throw an exception - LOG.debug("MsgQueue.put was interrupted (If we are exiting, this " + - "msg can be ignored)"); + if (info != null) { + shutdownQueue.put(new PendingServerShutdown(info)); } } } diff --git a/src/java/org/apache/hadoop/hbase/HRegionServer.java b/src/java/org/apache/hadoop/hbase/HRegionServer.java index 37a9bfff5ca..f7b5c85fd6b 100644 --- a/src/java/org/apache/hadoop/hbase/HRegionServer.java +++ b/src/java/org/apache/hadoop/hbase/HRegionServer.java @@ -96,7 +96,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { protected final ReentrantReadWriteLock lock = new ReentrantReadWriteLock(); private final Vector outboundMsgs = new Vector(); - int numRetries; + final int numRetries; protected final int threadWakeFrequency; private final int msgInterval; private final int serverLeaseTimeout; @@ -314,7 +314,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { } } } - + // HLog and HLog roller. log is protected rather than private to avoid // eclipse warning when accessed by inner classes protected HLog log; @@ -472,19 +472,27 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { // get it when the master is panicing because for instance // the HDFS has been yanked out from under it. Be wary of // this message. - try { - if (checkFileSystem()) { - closeAllRegions(); - restart = true; + if (checkFileSystem()) { + closeAllRegions(); + synchronized (logRollerLock) { + try { + log.closeAndDelete(); + serverInfo.setStartCode(rand.nextLong()); + log = setupHLog(); + } catch (IOException e) { + this.abortRequested = true; + this.stopRequested.set(true); + e = RemoteExceptionHandler.checkIOException(e); + LOG.fatal("error restarting server", e); + break; + } } - } catch (Exception e) { + reportForDuty(); + restart = true; + } else { LOG.fatal("file system available check failed. " + - "Shutting down server.", e); - this.stopRequested.set(true); - this.fsOk = false; - this.abortRequested = true; + "Shutting down server."); } - break; case HMsg.MSG_REGIONSERVER_STOP: @@ -604,7 +612,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { * Run init. Sets up hlog and starts up all server threads. * @param c Extra configuration. */ - private void init(final MapWritable c) { + private void init(final MapWritable c) throws IOException { try { for (Map.Entry e: c.entrySet()) { String key = e.getKey().toString(); @@ -618,18 +626,22 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { startServiceThreads(); } catch (IOException e) { this.stopRequested.set(true); - LOG.fatal("Failed init", - RemoteExceptionHandler.checkIOException(e)); + e = RemoteExceptionHandler.checkIOException(e); + LOG.fatal("Failed init", e); + IOException ex = new IOException("region server startup failed"); + ex.initCause(e); + throw ex; } } - private HLog setupHLog() - throws RegionServerRunningException, IOException { + private HLog setupHLog() throws RegionServerRunningException, + IOException { + String rootDir = this.conf.get(HConstants.HBASE_DIR); LOG.info("Root dir: " + rootDir); - Path logdir = new Path(new Path(rootDir), - "log" + "_" + getThisIP() + "_" + - this.serverInfo.getServerAddress().getPort()); + Path logdir = new Path(new Path(rootDir), "log" + "_" + getThisIP() + "_" + + this.serverInfo.getStartCode() + "_" + + this.serverInfo.getServerAddress().getPort()); if (LOG.isDebugEnabled()) { LOG.debug("Log dir " + logdir); } @@ -762,6 +774,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { } break; } catch(IOException e) { + LOG.warn("error telling master we are up", e); this.sleeper.sleep(lastMsg); continue; } @@ -1429,4 +1442,4 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable { public static void main(String [] args) { doMain(args, HRegionServer.class); } -} \ No newline at end of file +}