From 0165455d3490f9215fa75e8857cc4869632f2ac1 Mon Sep 17 00:00:00 2001 From: zhangduo Date: Fri, 5 Jan 2018 18:28:44 +0800 Subject: [PATCH] HBASE-19707 Race in start and terminate of a replication source after we async start replicatione endpoint --- .../RecoveredReplicationSource.java | 16 +- .../regionserver/ReplicationSource.java | 203 ++++++++++-------- 2 files changed, 116 insertions(+), 103 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java index 1be9a88f456..3cae0f2d1f9 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/RecoveredReplicationSource.java @@ -68,7 +68,7 @@ public class RecoveredReplicationSource extends ReplicationSource { LOG.debug("Someone has beat us to start a worker thread for wal group " + walGroupId); } else { LOG.debug("Starting up worker for wal group " + walGroupId); - worker.startup(getUncaughtExceptionHandler()); + worker.startup(this::uncaughtException); worker.setWALReader( startNewWALReader(worker.getName(), walGroupId, queue, worker.getStartPosition())); workerThreads.put(walGroupId, worker); @@ -76,13 +76,13 @@ public class RecoveredReplicationSource extends ReplicationSource { } @Override - protected ReplicationSourceWALReader startNewWALReader(String threadName, - String walGroupId, PriorityBlockingQueue queue, long startPosition) { - ReplicationSourceWALReader walReader = new RecoveredReplicationSourceWALReader(fs, - conf, queue, startPosition, walEntryFilter, this); - Threads.setDaemonThreadRunning(walReader, threadName - + ".replicationSource.replicationWALReaderThread." + walGroupId + "," + queueId, - getUncaughtExceptionHandler()); + protected ReplicationSourceWALReader startNewWALReader(String threadName, String walGroupId, + PriorityBlockingQueue queue, long startPosition) { + ReplicationSourceWALReader walReader = + new RecoveredReplicationSourceWALReader(fs, conf, queue, startPosition, walEntryFilter, this); + Threads.setDaemonThreadRunning(walReader, + threadName + ".replicationSource.replicationWALReaderThread." + walGroupId + "," + queueId, + this::uncaughtException); return walReader; } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java index 0092251a4a1..09b6cc111cb 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/replication/regionserver/ReplicationSource.java @@ -75,7 +75,7 @@ import org.apache.hbase.thirdparty.com.google.common.collect.Lists; *

*/ @InterfaceAudience.Private -public class ReplicationSource extends Thread implements ReplicationSourceInterface { +public class ReplicationSource implements ReplicationSourceInterface { private static final Logger LOG = LoggerFactory.getLogger(ReplicationSource.class); // Queues of logs to process, entry in format of walGroupId->queue, @@ -114,10 +114,8 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf private MetricsSource metrics; // WARN threshold for the number of queued logs, defaults to 2 private int logQueueWarnThreshold; - // whether the replication endpoint has been initialized - private volatile boolean endpointInitialized = false; // ReplicationEndpoint which will handle the actual replication - private ReplicationEndpoint replicationEndpoint; + private volatile ReplicationEndpoint replicationEndpoint; // A filter (or a chain of filters) for the WAL entries. protected WALEntryFilter walEntryFilter; // throttler @@ -135,6 +133,8 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf public static final int DEFAULT_WAIT_ON_ENDPOINT_SECONDS = 30; private int waitOnEndpointSeconds = -1; + private Thread initThread; + /** * Instantiation method used by region servers * @param conf configuration to use @@ -196,7 +196,7 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf if (queue == null) { queue = new PriorityBlockingQueue<>(queueSizePerGroup, new LogsComparator()); queues.put(logPrefix, queue); - if (this.isSourceActive() && this.endpointInitialized) { + if (this.isSourceActive() && this.replicationEndpoint != null) { // new wal group observed after source startup, start a new worker thread to track it // notice: it's possible that log enqueued when this.running is set but worker thread // still not launched, so it's necessary to check workerThreads before start the worker @@ -235,28 +235,36 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf } } - private void initAndStartReplicationEndpoint() throws Exception { + private ReplicationEndpoint createReplicationEndpoint() + throws InstantiationException, IllegalAccessException, ClassNotFoundException, IOException { RegionServerCoprocessorHost rsServerHost = null; - TableDescriptors tableDescriptors = null; if (server instanceof HRegionServer) { rsServerHost = ((HRegionServer) server).getRegionServerCoprocessorHost(); - tableDescriptors = ((HRegionServer) server).getTableDescriptors(); } String replicationEndpointImpl = replicationPeer.getPeerConfig().getReplicationEndpointImpl(); if (replicationEndpointImpl == null) { // Default to HBase inter-cluster replication endpoint replicationEndpointImpl = HBaseInterClusterReplicationEndpoint.class.getName(); } - replicationEndpoint = - Class.forName(replicationEndpointImpl).asSubclass(ReplicationEndpoint.class).newInstance(); + ReplicationEndpoint replicationEndpoint = + Class.forName(replicationEndpointImpl).asSubclass(ReplicationEndpoint.class).newInstance(); if (rsServerHost != null) { ReplicationEndpoint newReplicationEndPoint = - rsServerHost.postCreateReplicationEndPoint(replicationEndpoint); + rsServerHost.postCreateReplicationEndPoint(replicationEndpoint); if (newReplicationEndPoint != null) { // Override the newly created endpoint from the hook with configured end point replicationEndpoint = newReplicationEndPoint; } } + return replicationEndpoint; + } + + private void initAndStartReplicationEndpoint(ReplicationEndpoint replicationEndpoint) + throws IOException, TimeoutException { + TableDescriptors tableDescriptors = null; + if (server instanceof HRegionServer) { + tableDescriptors = ((HRegionServer) server).getTableDescriptors(); + } replicationEndpoint .init(new ReplicationEndpoint.Context(conf, replicationPeer.getConfiguration(), fs, peerId, clusterId, replicationPeer, metrics, tableDescriptors, server)); @@ -264,60 +272,6 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf replicationEndpoint.awaitRunning(waitOnEndpointSeconds, TimeUnit.SECONDS); } - @Override - public void run() { - // mark we are running now - this.sourceRunning = true; - - int sleepMultiplier = 1; - while (this.isSourceActive()) { - try { - initAndStartReplicationEndpoint(); - break; - } catch (Exception e) { - LOG.warn("Error starting ReplicationEndpoint, retrying", e); - if (replicationEndpoint != null) { - replicationEndpoint.stop(); - replicationEndpoint = null; - } - if (sleepForRetries("Error starting ReplicationEndpoint", sleepMultiplier)) { - sleepMultiplier++; - } - } - } - this.endpointInitialized = true; - - sleepMultiplier = 1; - // delay this until we are in an asynchronous thread - while (this.isSourceActive() && this.peerClusterId == null) { - this.peerClusterId = replicationEndpoint.getPeerUUID(); - if (this.isSourceActive() && this.peerClusterId == null) { - if (sleepForRetries("Cannot contact the peer's zk ensemble", sleepMultiplier)) { - sleepMultiplier++; - } - } - } - - // In rare case, zookeeper setting may be messed up. That leads to the incorrect - // peerClusterId value, which is the same as the source clusterId - if (clusterId.equals(peerClusterId) && !replicationEndpoint.canReplicateToSameCluster()) { - this.terminate("ClusterId " + clusterId + " is replicating to itself: peerClusterId " - + peerClusterId + " which is not allowed by ReplicationEndpoint:" - + replicationEndpoint.getClass().getName(), null, false); - this.manager.removeSource(this); - return; - } - LOG.info("Replicating " + clusterId + " -> " + peerClusterId); - - initializeWALEntryFilter(); - // start workers - for (Map.Entry> entry : queues.entrySet()) { - String walGroupId = entry.getKey(); - PriorityBlockingQueue queue = entry.getValue(); - tryStartNewShipper(walGroupId, queue); - } - } - private void initializeWALEntryFilter() { // get the WALEntryFilter from ReplicationEndpoint and add it to default filters ArrayList filters = @@ -331,37 +285,31 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf } protected void tryStartNewShipper(String walGroupId, PriorityBlockingQueue queue) { - final ReplicationSourceShipper worker = new ReplicationSourceShipper(conf, - walGroupId, queue, this); + ReplicationSourceShipper worker = new ReplicationSourceShipper(conf, walGroupId, queue, this); ReplicationSourceShipper extant = workerThreads.putIfAbsent(walGroupId, worker); if (extant != null) { LOG.debug("Someone has beat us to start a worker thread for wal group " + walGroupId); } else { LOG.debug("Starting up worker for wal group " + walGroupId); - worker.startup(getUncaughtExceptionHandler()); - worker.setWALReader(startNewWALReader(worker.getName(), walGroupId, queue, - worker.getStartPosition())); + worker.startup(this::uncaughtException); + worker.setWALReader( + startNewWALReader(worker.getName(), walGroupId, queue, worker.getStartPosition())); } } protected ReplicationSourceWALReader startNewWALReader(String threadName, String walGroupId, PriorityBlockingQueue queue, long startPosition) { ReplicationSourceWALReader walReader = - new ReplicationSourceWALReader(fs, conf, queue, startPosition, walEntryFilter, this); + new ReplicationSourceWALReader(fs, conf, queue, startPosition, walEntryFilter, this); return (ReplicationSourceWALReader) Threads.setDaemonThreadRunning(walReader, threadName + ".replicationSource.wal-reader." + walGroupId + "," + queueId, - getUncaughtExceptionHandler()); + this::uncaughtException); } - public Thread.UncaughtExceptionHandler getUncaughtExceptionHandler() { - return new Thread.UncaughtExceptionHandler() { - @Override - public void uncaughtException(final Thread t, final Throwable e) { - RSRpcServices.exitIfOOME(e); - LOG.error("Unexpected exception in " + t.getName() + " currentPath=" + getCurrentPath(), e); - server.stop("Unexpected exception in " + t.getName()); - } - }; + protected final void uncaughtException(Thread t, Throwable e) { + RSRpcServices.exitIfOOME(e); + LOG.error("Unexpected exception in " + t.getName() + " currentPath=" + getCurrentPath(), e); + server.abort("Unexpected exception in " + t.getName(), e); } @Override @@ -434,17 +382,76 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf return replicationPeer.isPeerEnabled(); } + private void initialize() { + int sleepMultiplier = 1; + while (this.isSourceActive()) { + ReplicationEndpoint replicationEndpoint; + try { + replicationEndpoint = createReplicationEndpoint(); + } catch (Exception e) { + LOG.warn("error creating ReplicationEndpoint, retry", e); + if (sleepForRetries("Error creating ReplicationEndpoint", sleepMultiplier)) { + sleepMultiplier++; + } + continue; + } + + try { + initAndStartReplicationEndpoint(replicationEndpoint); + this.replicationEndpoint = replicationEndpoint; + break; + } catch (Exception e) { + LOG.warn("Error starting ReplicationEndpoint, retry", e); + replicationEndpoint.stop(); + if (sleepForRetries("Error starting ReplicationEndpoint", sleepMultiplier)) { + sleepMultiplier++; + } + } + } + + if (!this.isSourceActive()) { + return; + } + + sleepMultiplier = 1; + // delay this until we are in an asynchronous thread + while (this.isSourceActive() && this.peerClusterId == null) { + this.peerClusterId = replicationEndpoint.getPeerUUID(); + if (this.isSourceActive() && this.peerClusterId == null) { + if (sleepForRetries("Cannot contact the peer's zk ensemble", sleepMultiplier)) { + sleepMultiplier++; + } + } + } + + // In rare case, zookeeper setting may be messed up. That leads to the incorrect + // peerClusterId value, which is the same as the source clusterId + if (clusterId.equals(peerClusterId) && !replicationEndpoint.canReplicateToSameCluster()) { + this.terminate("ClusterId " + clusterId + " is replicating to itself: peerClusterId " + + peerClusterId + " which is not allowed by ReplicationEndpoint:" + + replicationEndpoint.getClass().getName(), null, false); + this.manager.removeSource(this); + return; + } + LOG.info("Replicating " + clusterId + " -> " + peerClusterId); + + initializeWALEntryFilter(); + // start workers + for (Map.Entry> entry : queues.entrySet()) { + String walGroupId = entry.getKey(); + PriorityBlockingQueue queue = entry.getValue(); + tryStartNewShipper(walGroupId, queue); + } + } + @Override public void startup() { - String n = Thread.currentThread().getName(); - Thread.UncaughtExceptionHandler handler = new Thread.UncaughtExceptionHandler() { - @Override - public void uncaughtException(final Thread t, final Throwable e) { - LOG.error("Unexpected exception in ReplicationSource", e); - } - }; - Threads.setDaemonThreadRunning(this, n + ".replicationSource," + this.queueId, - handler); + // mark we are running now + this.sourceRunning = true; + initThread = new Thread(this::initialize); + Threads.setDaemonThreadRunning(initThread, + Thread.currentThread().getName() + ".replicationSource," + this.queueId, + this::uncaughtException); } @Override @@ -465,6 +472,13 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf cause); } this.sourceRunning = false; + if (initThread != null && Thread.currentThread() != initThread) { + // This usually won't happen but anyway, let's wait until the initialization thread exits. + // And notice that we may call terminate directly from the initThread so here we need to + // avoid join on ourselves. + initThread.interrupt(); + Threads.shutdown(initThread, this.sleepForRetries); + } Collection workers = workerThreads.values(); for (ReplicationSourceShipper worker : workers) { worker.stopWorker(); @@ -481,12 +495,11 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf } if (this.replicationEndpoint != null) { try { - this.replicationEndpoint - .awaitTerminated(sleepForRetries * maxRetriesMultiplier, TimeUnit.MILLISECONDS); + this.replicationEndpoint.awaitTerminated(sleepForRetries * maxRetriesMultiplier, + TimeUnit.MILLISECONDS); } catch (TimeoutException te) { - LOG.warn("Got exception while waiting for endpoint to shutdown for replication source :" - + this.queueId, - te); + LOG.warn("Got exception while waiting for endpoint to shutdown for replication source :" + + this.queueId, te); } } }