From 3f485568a92450e1e1ca6148a3db1dc85b9dbaf1 Mon Sep 17 00:00:00 2001 From: nkeywal Date: Thu, 20 Feb 2014 15:16:46 +0000 Subject: [PATCH] HBASE-10524 Correct wrong handling and add proper handling for swallowed InterruptedException thrown by Thread.sleep in regionserver (Feng Honghua) git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1570219 13f79535-47bb-0310-9956-ffa450edef68 --- .../hbase/regionserver/HRegionServer.java | 79 ++++++++------- .../hbase/regionserver/SplitLogWorker.java | 96 ++++++++----------- 2 files changed, 84 insertions(+), 91 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java index ea397917bd4..96208e0ee5c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java @@ -1939,46 +1939,55 @@ public class HRegionServer implements ClientProtos.ClientService.BlockingInterfa RegionServerStatusService.BlockingInterface master = null; boolean refresh = false; // for the first time, use cached data RegionServerStatusService.BlockingInterface intf = null; - while (keepLooping() && master == null) { - sn = this.masterAddressManager.getMasterAddress(refresh); - if (sn == null) { - if (!keepLooping()) { - // give up with no connection. - LOG.debug("No master found and cluster is stopped; bailing out"); - return null; + boolean interrupted = false; + try { + while (keepLooping() && master == null) { + sn = this.masterAddressManager.getMasterAddress(refresh); + if (sn == null) { + if (!keepLooping()) { + // give up with no connection. + LOG.debug("No master found and cluster is stopped; bailing out"); + return null; + } + LOG.debug("No master found; retry"); + previousLogTime = System.currentTimeMillis(); + refresh = true; // let's try pull it from ZK directly + sleeper.sleep(); + continue; } - LOG.debug("No master found; retry"); - previousLogTime = System.currentTimeMillis(); - refresh = true; // let's try pull it from ZK directly - sleeper.sleep(); - continue; - } - new InetSocketAddress(sn.getHostname(), sn.getPort()); - try { - BlockingRpcChannel channel = - this.rpcClient.createBlockingRpcChannel(sn, userProvider.getCurrent(), this.rpcTimeout); - intf = RegionServerStatusService.newBlockingStub(channel); - break; - } catch (IOException e) { - e = e instanceof RemoteException ? - ((RemoteException)e).unwrapRemoteException() : e; - if (e instanceof ServerNotRunningYetException) { - if (System.currentTimeMillis() > (previousLogTime+1000)){ - LOG.info("Master isn't available yet, retrying"); - previousLogTime = System.currentTimeMillis(); - } - } else { - if (System.currentTimeMillis() > (previousLogTime + 1000)) { - LOG.warn("Unable to connect to master. Retrying. Error was:", e); - previousLogTime = System.currentTimeMillis(); - } - } + new InetSocketAddress(sn.getHostname(), sn.getPort()); try { - Thread.sleep(200); - } catch (InterruptedException ignored) { + BlockingRpcChannel channel = + this.rpcClient.createBlockingRpcChannel(sn, userProvider.getCurrent(), this.rpcTimeout); + intf = RegionServerStatusService.newBlockingStub(channel); + break; + } catch (IOException e) { + e = e instanceof RemoteException ? + ((RemoteException)e).unwrapRemoteException() : e; + if (e instanceof ServerNotRunningYetException) { + if (System.currentTimeMillis() > (previousLogTime+1000)){ + LOG.info("Master isn't available yet, retrying"); + previousLogTime = System.currentTimeMillis(); + } + } else { + if (System.currentTimeMillis() > (previousLogTime + 1000)) { + LOG.warn("Unable to connect to master. Retrying. Error was:", e); + previousLogTime = System.currentTimeMillis(); + } + } + try { + Thread.sleep(200); + } catch (InterruptedException ex) { + interrupted = true; + LOG.warn("Interrupted while sleeping"); + } } } + } finally { + if (interrupted) { + Thread.currentThread().interrupt(); + } } return new Pair(sn, intf); } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java index 4a83741a0e9..77265351a5d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/SplitLogWorker.java @@ -50,6 +50,7 @@ import org.apache.hadoop.hbase.regionserver.wal.HLogSplitter; import org.apache.hadoop.hbase.regionserver.wal.HLogUtil; import org.apache.hadoop.hbase.util.CancelableProgressable; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.ExceptionUtil; import org.apache.hadoop.hbase.util.FSUtils; import org.apache.hadoop.hbase.zookeeper.ZKSplitLog; import org.apache.hadoop.hbase.zookeeper.ZKUtil; @@ -190,26 +191,23 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable { LOG.warn("Exception when checking for " + watcher.splitLogZNode + " ... retrying", e); } if (res == -1) { - try { - LOG.info(watcher.splitLogZNode + " znode does not exist, waiting for master to create"); - Thread.sleep(1000); - } catch (InterruptedException e) { - LOG.debug("Interrupted while waiting for " + watcher.splitLogZNode - + (exitWorker ? "" : " (ERROR: exitWorker is not set, " + - "exiting anyway)")); - exitWorker = true; - break; - } + LOG.info(watcher.splitLogZNode + " znode does not exist, waiting for master to create"); + Thread.sleep(1000); } } if (!exitWorker) { - taskLoop(); + taskLoop(); } } catch (Throwable t) { - // only a logical error can cause here. Printing it out - // to make debugging easier - LOG.error("unexpected error ", t); + if (ExceptionUtil.isInterrupt(t)) { + LOG.info("SplitLogWorker interrupted. Exiting. " + (exitWorker ? "" : + " (ERROR: exitWorker is not set, exiting anyway)")); + } else { + // only a logical error can cause here. Printing it out + // to make debugging easier + LOG.error("unexpected error ", t); + } } finally { LOG.info("SplitLogWorker " + this.serverName + " exiting"); } @@ -223,7 +221,7 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable { * Synchronization using {@link #taskReadyLock} ensures that it will * try to grab every task that has been put up */ - private void taskLoop() { + private void taskLoop() throws InterruptedException { while (!exitWorker) { int seq_start = taskReadySeq; List paths = getTaskList(); @@ -259,50 +257,41 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable { SplitLogCounters.tot_wkr_task_grabing.incrementAndGet(); synchronized (taskReadyLock) { while (seq_start == taskReadySeq) { - try { - taskReadyLock.wait(checkInterval); - if (this.server != null) { - // check to see if we have stale recovering regions in our internal memory state - Map recoveringRegions = this.server.getRecoveringRegions(); - if (!recoveringRegions.isEmpty()) { - // Make a local copy to prevent ConcurrentModificationException when other threads - // modify recoveringRegions - List tmpCopy = new ArrayList(recoveringRegions.keySet()); - for (String region : tmpCopy) { - String nodePath = ZKUtil.joinZNode(this.watcher.recoveringRegionsZNode, region); - try { - if (ZKUtil.checkExists(this.watcher, nodePath) == -1) { - HRegion r = recoveringRegions.remove(region); - if (r != null) { - r.setRecovering(false); - } - LOG.debug("Mark recovering region:" + region + " up."); - } else { - // current check is a defensive(or redundant) mechanism to prevent us from - // having stale recovering regions in our internal RS memory state while - // zookeeper(source of truth) says differently. We stop at the first good one - // because we should not have a single instance such as this in normal case so - // check the first one is good enough. - break; + taskReadyLock.wait(checkInterval); + if (this.server != null) { + // check to see if we have stale recovering regions in our internal memory state + Map recoveringRegions = this.server.getRecoveringRegions(); + if (!recoveringRegions.isEmpty()) { + // Make a local copy to prevent ConcurrentModificationException when other threads + // modify recoveringRegions + List tmpCopy = new ArrayList(recoveringRegions.keySet()); + for (String region : tmpCopy) { + String nodePath = ZKUtil.joinZNode(this.watcher.recoveringRegionsZNode, region); + try { + if (ZKUtil.checkExists(this.watcher, nodePath) == -1) { + HRegion r = recoveringRegions.remove(region); + if (r != null) { + r.setRecovering(false); } - } catch (KeeperException e) { - // ignore zookeeper error - LOG.debug("Got a zookeeper when trying to open a recovering region", e); + LOG.debug("Mark recovering region:" + region + " up."); + } else { + // current check is a defensive(or redundant) mechanism to prevent us from + // having stale recovering regions in our internal RS memory state while + // zookeeper(source of truth) says differently. We stop at the first good one + // because we should not have a single instance such as this in normal case so + // check the first one is good enough. break; } + } catch (KeeperException e) { + // ignore zookeeper error + LOG.debug("Got a zookeeper when trying to open a recovering region", e); + break; } } } - } catch (InterruptedException e) { - LOG.info("SplitLogWorker interrupted while waiting for task," + - " exiting: " + e.toString() + (exitWorker ? "" : - " (ERROR: exitWorker is not set, exiting anyway)")); - exitWorker = true; - return; } } } - } } @@ -559,7 +548,7 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable { } - private List getTaskList() { + private List getTaskList() throws InterruptedException { List childrenPaths = null; long sleepTime = 1000; // It will be in loop till it gets the list of children or @@ -575,14 +564,9 @@ public class SplitLogWorker extends ZooKeeperListener implements Runnable { LOG.warn("Could not get children of znode " + this.watcher.splitLogZNode, e); } - try { LOG.debug("Retry listChildren of znode " + this.watcher.splitLogZNode + " after sleep for " + sleepTime + "ms!"); Thread.sleep(sleepTime); - } catch (InterruptedException e1) { - LOG.warn("Interrupted while trying to get task list ...", e1); - Thread.currentThread().interrupt(); - } } return childrenPaths; }