HADOOP-1937 When the master times out a region server's lease, it is too aggressive in reclaiming the server's log.
git-svn-id: https://svn.apache.org/repos/asf/lucene/hadoop/trunk/src/contrib/hbase@582165 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
1acbfbde1f
commit
6e4efdf697
|
@ -66,6 +66,8 @@ Trunk (unreleased changes)
|
|||
HADOOP-1975 HBase tests failing with java.lang.NumberFormatException
|
||||
HADOOP-1990 Regression test instability affects nightly and patch builds
|
||||
HADOOP-1996 TestHStoreFile fails on windows if run multiple times
|
||||
HADOOP-1937 When the master times out a region server's lease, it is too
|
||||
aggressive in reclaiming the server's log.
|
||||
|
||||
IMPROVEMENTS
|
||||
HADOOP-1737 Make HColumnDescriptor data publically members settable
|
||||
|
|
|
@ -35,6 +35,8 @@ import java.util.TimerTask;
|
|||
import java.util.TreeMap;
|
||||
import java.util.TreeSet;
|
||||
import java.util.concurrent.BlockingQueue;
|
||||
import java.util.concurrent.Delayed;
|
||||
import java.util.concurrent.DelayQueue;
|
||||
import java.util.concurrent.LinkedBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
|
@ -99,8 +101,10 @@ HMasterRegionInterface {
|
|||
int numRetries;
|
||||
long maxRegionOpenTime;
|
||||
|
||||
DelayQueue<PendingServerShutdown> shutdownQueue;
|
||||
BlockingQueue<PendingOperation> msgQueue;
|
||||
|
||||
int leaseTimeout;
|
||||
private Leases serverLeases;
|
||||
private Server server;
|
||||
private HServerAddress address;
|
||||
|
@ -860,11 +864,12 @@ HMasterRegionInterface {
|
|||
this.numRetries = conf.getInt("hbase.client.retries.number", 2);
|
||||
this.maxRegionOpenTime =
|
||||
conf.getLong("hbase.hbasemaster.maxregionopen", 30 * 1000);
|
||||
|
||||
|
||||
this.shutdownQueue = new DelayQueue<PendingServerShutdown>();
|
||||
this.msgQueue = new LinkedBlockingQueue<PendingOperation>();
|
||||
|
||||
this.serverLeases = new Leases(
|
||||
conf.getInt("hbase.master.lease.period", 30 * 1000),
|
||||
|
||||
this.leaseTimeout = conf.getInt("hbase.master.lease.period", 30 * 1000);
|
||||
this.serverLeases = new Leases(this.leaseTimeout,
|
||||
conf.getInt("hbase.master.lease.thread.wakefrequency", 15 * 1000));
|
||||
|
||||
this.server = RPC.getServer(this, address.getBindAddress(),
|
||||
|
@ -966,10 +971,13 @@ HMasterRegionInterface {
|
|||
*/
|
||||
try {
|
||||
for (PendingOperation op = null; !closed.get(); ) {
|
||||
try {
|
||||
op = msgQueue.poll(threadWakeFrequency, TimeUnit.MILLISECONDS);
|
||||
} catch (InterruptedException e) {
|
||||
// continue
|
||||
op = shutdownQueue.poll();
|
||||
if (op == null ) {
|
||||
try {
|
||||
op = msgQueue.poll(threadWakeFrequency, TimeUnit.MILLISECONDS);
|
||||
} catch (InterruptedException e) {
|
||||
// continue
|
||||
}
|
||||
}
|
||||
if (op == null || closed.get()) {
|
||||
continue;
|
||||
|
@ -1117,6 +1125,7 @@ HMasterRegionInterface {
|
|||
* HMasterRegionInterface
|
||||
*/
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@SuppressWarnings("unused")
|
||||
public MapWritable regionServerStartup(HServerInfo serverInfo)
|
||||
throws IOException {
|
||||
|
@ -1140,11 +1149,7 @@ HMasterRegionInterface {
|
|||
serversToServerInfo.notifyAll();
|
||||
}
|
||||
if (storedInfo != null && !closed.get()) {
|
||||
try {
|
||||
msgQueue.put(new PendingServerShutdown(storedInfo));
|
||||
} catch (InterruptedException e) {
|
||||
throw new RuntimeException("Putting into msgQueue was interrupted.", e);
|
||||
}
|
||||
shutdownQueue.put(new PendingServerShutdown(storedInfo));
|
||||
}
|
||||
|
||||
// Either way, record the new server
|
||||
|
@ -1683,9 +1688,12 @@ HMasterRegionInterface {
|
|||
* The region server's log file needs to be split up for each region it was
|
||||
* serving, and the regions need to get reassigned.
|
||||
*/
|
||||
private class PendingServerShutdown extends PendingOperation {
|
||||
private class PendingServerShutdown extends PendingOperation
|
||||
implements Delayed {
|
||||
private long delay;
|
||||
private HServerAddress deadServer;
|
||||
private String deadServerName;
|
||||
private Path oldLogDir;
|
||||
private transient boolean logSplit;
|
||||
private transient boolean rootChecked;
|
||||
private transient boolean rootRescanned;
|
||||
|
@ -1706,13 +1714,32 @@ HMasterRegionInterface {
|
|||
|
||||
PendingServerShutdown(HServerInfo serverInfo) {
|
||||
super();
|
||||
this.delay = leaseTimeout / 2;
|
||||
this.deadServer = serverInfo.getServerAddress();
|
||||
this.deadServerName = this.deadServer.toString();
|
||||
this.logSplit = false;
|
||||
this.rootChecked = false;
|
||||
this.rootRescanned = false;
|
||||
StringBuilder dirName = new StringBuilder("log_");
|
||||
dirName.append(deadServer.getBindAddress());
|
||||
dirName.append("_");
|
||||
dirName.append(serverInfo.getStartCode());
|
||||
dirName.append("_");
|
||||
dirName.append(deadServer.getPort());
|
||||
this.oldLogDir = new Path(dir, dirName.toString());
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
public long getDelay(TimeUnit unit) {
|
||||
return unit.convert(delay, TimeUnit.MILLISECONDS);
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
public int compareTo(Delayed o) {
|
||||
return Long.valueOf(getDelay(TimeUnit.MILLISECONDS)
|
||||
- o.getDelay(TimeUnit.MILLISECONDS)).intValue();
|
||||
}
|
||||
|
||||
/** {@inheritDoc} */
|
||||
@Override
|
||||
public String toString() {
|
||||
|
@ -1875,17 +1902,12 @@ HMasterRegionInterface {
|
|||
|
||||
if (!logSplit) {
|
||||
// Process the old log file
|
||||
StringBuilder dirName = new StringBuilder("log_");
|
||||
dirName.append(deadServer.getBindAddress());
|
||||
dirName.append("_");
|
||||
dirName.append(deadServer.getPort());
|
||||
Path logdir = new Path(dir, dirName.toString());
|
||||
if (fs.exists(logdir)) {
|
||||
if (fs.exists(oldLogDir)) {
|
||||
if (!splitLogLock.tryLock()) {
|
||||
return false;
|
||||
}
|
||||
try {
|
||||
HLog.splitLog(dir, logdir, fs, conf);
|
||||
HLog.splitLog(dir, oldLogDir, fs, conf);
|
||||
} finally {
|
||||
splitLogLock.unlock();
|
||||
}
|
||||
|
@ -2901,16 +2923,8 @@ HMasterRegionInterface {
|
|||
// NOTE: If the server was serving the root region, we cannot reassign it
|
||||
// here because the new server will start serving the root region before
|
||||
// the PendingServerShutdown operation has a chance to split the log file.
|
||||
try {
|
||||
if (info != null) {
|
||||
msgQueue.put(new PendingServerShutdown(info));
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
// continue. We used to throw a RuntimeException here but on exit
|
||||
// this put is often interrupted. For now, just log these iterrupts
|
||||
// rather than throw an exception
|
||||
LOG.debug("MsgQueue.put was interrupted (If we are exiting, this " +
|
||||
"msg can be ignored)");
|
||||
if (info != null) {
|
||||
shutdownQueue.put(new PendingServerShutdown(info));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -96,7 +96,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
|
|||
protected final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
|
||||
private final Vector<HMsg> outboundMsgs = new Vector<HMsg>();
|
||||
|
||||
int numRetries;
|
||||
final int numRetries;
|
||||
protected final int threadWakeFrequency;
|
||||
private final int msgInterval;
|
||||
private final int serverLeaseTimeout;
|
||||
|
@ -314,7 +314,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
|
|||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// HLog and HLog roller. log is protected rather than private to avoid
|
||||
// eclipse warning when accessed by inner classes
|
||||
protected HLog log;
|
||||
|
@ -472,19 +472,27 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
|
|||
// get it when the master is panicing because for instance
|
||||
// the HDFS has been yanked out from under it. Be wary of
|
||||
// this message.
|
||||
try {
|
||||
if (checkFileSystem()) {
|
||||
closeAllRegions();
|
||||
restart = true;
|
||||
if (checkFileSystem()) {
|
||||
closeAllRegions();
|
||||
synchronized (logRollerLock) {
|
||||
try {
|
||||
log.closeAndDelete();
|
||||
serverInfo.setStartCode(rand.nextLong());
|
||||
log = setupHLog();
|
||||
} catch (IOException e) {
|
||||
this.abortRequested = true;
|
||||
this.stopRequested.set(true);
|
||||
e = RemoteExceptionHandler.checkIOException(e);
|
||||
LOG.fatal("error restarting server", e);
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (Exception e) {
|
||||
reportForDuty();
|
||||
restart = true;
|
||||
} else {
|
||||
LOG.fatal("file system available check failed. " +
|
||||
"Shutting down server.", e);
|
||||
this.stopRequested.set(true);
|
||||
this.fsOk = false;
|
||||
this.abortRequested = true;
|
||||
"Shutting down server.");
|
||||
}
|
||||
|
||||
break;
|
||||
|
||||
case HMsg.MSG_REGIONSERVER_STOP:
|
||||
|
@ -604,7 +612,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
|
|||
* Run init. Sets up hlog and starts up all server threads.
|
||||
* @param c Extra configuration.
|
||||
*/
|
||||
private void init(final MapWritable c) {
|
||||
private void init(final MapWritable c) throws IOException {
|
||||
try {
|
||||
for (Map.Entry<Writable, Writable> e: c.entrySet()) {
|
||||
String key = e.getKey().toString();
|
||||
|
@ -618,18 +626,22 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
|
|||
startServiceThreads();
|
||||
} catch (IOException e) {
|
||||
this.stopRequested.set(true);
|
||||
LOG.fatal("Failed init",
|
||||
RemoteExceptionHandler.checkIOException(e));
|
||||
e = RemoteExceptionHandler.checkIOException(e);
|
||||
LOG.fatal("Failed init", e);
|
||||
IOException ex = new IOException("region server startup failed");
|
||||
ex.initCause(e);
|
||||
throw ex;
|
||||
}
|
||||
}
|
||||
|
||||
private HLog setupHLog()
|
||||
throws RegionServerRunningException, IOException {
|
||||
private HLog setupHLog() throws RegionServerRunningException,
|
||||
IOException {
|
||||
|
||||
String rootDir = this.conf.get(HConstants.HBASE_DIR);
|
||||
LOG.info("Root dir: " + rootDir);
|
||||
Path logdir = new Path(new Path(rootDir),
|
||||
"log" + "_" + getThisIP() + "_" +
|
||||
this.serverInfo.getServerAddress().getPort());
|
||||
Path logdir = new Path(new Path(rootDir), "log" + "_" + getThisIP() + "_" +
|
||||
this.serverInfo.getStartCode() + "_" +
|
||||
this.serverInfo.getServerAddress().getPort());
|
||||
if (LOG.isDebugEnabled()) {
|
||||
LOG.debug("Log dir " + logdir);
|
||||
}
|
||||
|
@ -762,6 +774,7 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
|
|||
}
|
||||
break;
|
||||
} catch(IOException e) {
|
||||
LOG.warn("error telling master we are up", e);
|
||||
this.sleeper.sleep(lastMsg);
|
||||
continue;
|
||||
}
|
||||
|
@ -1429,4 +1442,4 @@ public class HRegionServer implements HConstants, HRegionInterface, Runnable {
|
|||
public static void main(String [] args) {
|
||||
doMain(args, HRegionServer.class);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue