HBASE-25627: HBase replication should have a metric to represent if the source is stuck getting initialized (#3009)
Signed-off-by: Bharath Vissapragada <bharathv@apache.org>
This commit is contained in:
parent
716c685497
commit
97c152ea7a
|
@ -50,8 +50,8 @@ public interface MetricsReplicationSourceSource extends BaseSource {
|
|||
public static final String SOURCE_COMPLETED_LOGS = "source.completedLogs";
|
||||
public static final String SOURCE_COMPLETED_RECOVERY_QUEUES = "source.completedRecoverQueues";
|
||||
public static final String SOURCE_FAILED_RECOVERY_QUEUES = "source.failedRecoverQueues";
|
||||
/* Used to track the age of oldest wal in ms since its creation time */
|
||||
String OLDEST_WAL_AGE = "source.oldestWalAge";
|
||||
// This is to track the num of replication sources getting initialized
|
||||
public static final String SOURCE_INITIALIZING = "source.numInitializing";
|
||||
|
||||
void setLastShippedAge(long age);
|
||||
void incrSizeOfLogQueue(int size);
|
||||
|
@ -78,4 +78,7 @@ public interface MetricsReplicationSourceSource extends BaseSource {
|
|||
void incrFailedRecoveryQueue();
|
||||
void setOldestWalAge(long age);
|
||||
long getOldestWalAge();
|
||||
void incrSourceInitializing();
|
||||
void decrSourceInitializing();
|
||||
int getSourceInitializing();
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.replication.regionserver;
|
|||
|
||||
import org.apache.hadoop.hbase.metrics.MetricRegistryInfo;
|
||||
import org.apache.hadoop.metrics2.lib.MutableFastCounter;
|
||||
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
|
||||
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
|
||||
import org.apache.hadoop.metrics2.lib.MutableHistogram;
|
||||
|
||||
|
@ -46,6 +47,7 @@ public class MetricsReplicationGlobalSourceSource implements MetricsReplicationS
|
|||
private final MutableFastCounter completedWAL;
|
||||
private final MutableFastCounter completedRecoveryQueue;
|
||||
private final MutableFastCounter failedRecoveryQueue;
|
||||
private final MutableGaugeInt sourceInitializing;
|
||||
|
||||
public MetricsReplicationGlobalSourceSource(MetricsReplicationSourceImpl rms) {
|
||||
this.rms = rms;
|
||||
|
@ -82,6 +84,7 @@ public class MetricsReplicationGlobalSourceSource implements MetricsReplicationS
|
|||
completedRecoveryQueue = rms.getMetricsRegistry().getCounter(SOURCE_COMPLETED_RECOVERY_QUEUES, 0L);
|
||||
failedRecoveryQueue = rms.getMetricsRegistry()
|
||||
.getCounter(SOURCE_FAILED_RECOVERY_QUEUES, 0L);
|
||||
sourceInitializing = rms.getMetricsRegistry().getGaugeInt(SOURCE_INITIALIZING, 0);
|
||||
}
|
||||
|
||||
@Override public void setLastShippedAge(long age) {
|
||||
|
@ -208,6 +211,21 @@ public class MetricsReplicationGlobalSourceSource implements MetricsReplicationS
|
|||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void incrSourceInitializing() {
|
||||
sourceInitializing.incr(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void decrSourceInitializing() {
|
||||
sourceInitializing.decr(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getSourceInitializing() {
|
||||
return sourceInitializing.value();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init() {
|
||||
rms.init();
|
||||
|
|
|
@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.replication.regionserver;
|
|||
|
||||
import org.apache.hadoop.hbase.metrics.MetricRegistryInfo;
|
||||
import org.apache.hadoop.metrics2.lib.MutableFastCounter;
|
||||
import org.apache.hadoop.metrics2.lib.MutableGaugeInt;
|
||||
import org.apache.hadoop.metrics2.lib.MutableGaugeLong;
|
||||
import org.apache.hadoop.metrics2.lib.MutableHistogram;
|
||||
|
||||
|
@ -41,6 +42,7 @@ public class MetricsReplicationSourceSourceImpl implements MetricsReplicationSou
|
|||
private final String shippedHFilesKey;
|
||||
private final String sizeOfHFileRefsQueueKey;
|
||||
private final String oldestWalAgeKey;
|
||||
private final String sourceInitializingKey;
|
||||
|
||||
private final MutableHistogram ageOfLastShippedOpHist;
|
||||
private final MutableGaugeLong sizeOfLogQueueGauge;
|
||||
|
@ -69,6 +71,7 @@ public class MetricsReplicationSourceSourceImpl implements MetricsReplicationSou
|
|||
private final MutableFastCounter completedWAL;
|
||||
private final MutableFastCounter completedRecoveryQueue;
|
||||
private final MutableGaugeLong oldestWalAge;
|
||||
private final MutableGaugeInt sourceInitializing;
|
||||
|
||||
public MetricsReplicationSourceSourceImpl(MetricsReplicationSourceImpl rms, String id) {
|
||||
this.rms = rms;
|
||||
|
@ -131,6 +134,9 @@ public class MetricsReplicationSourceSourceImpl implements MetricsReplicationSou
|
|||
|
||||
oldestWalAgeKey = this.keyPrefix + "oldestWalAge";
|
||||
oldestWalAge = rms.getMetricsRegistry().getGauge(oldestWalAgeKey, 0L);
|
||||
|
||||
sourceInitializingKey = this.keyPrefix + "isInitializing";
|
||||
sourceInitializing = rms.getMetricsRegistry().getGaugeInt(sourceInitializingKey, 0);
|
||||
}
|
||||
|
||||
@Override public void setLastShippedAge(long age) {
|
||||
|
@ -197,6 +203,7 @@ public class MetricsReplicationSourceSourceImpl implements MetricsReplicationSou
|
|||
rms.removeMetric(completedLogsKey);
|
||||
rms.removeMetric(completedRecoveryKey);
|
||||
rms.removeMetric(oldestWalAgeKey);
|
||||
rms.removeMetric(sourceInitializingKey);
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -270,6 +277,20 @@ public class MetricsReplicationSourceSourceImpl implements MetricsReplicationSou
|
|||
return oldestWalAge.value();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void incrSourceInitializing() {
|
||||
sourceInitializing.incr(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public int getSourceInitializing() {
|
||||
return sourceInitializing.value();
|
||||
}
|
||||
|
||||
@Override public void decrSourceInitializing() {
|
||||
sourceInitializing.decr(1);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void init() {
|
||||
rms.init();
|
||||
|
|
|
@ -449,6 +449,40 @@ public class DynamicMetricsRegistry {
|
|||
return (MutableGaugeLong) metric;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a MetricMutableGaugeInt from the storage. If it is not there atomically put it.
|
||||
*
|
||||
* @param gaugeName name of the gauge to create or get.
|
||||
* @param potentialStartingValue value of the new gauge if we have to create it.
|
||||
*/
|
||||
public MutableGaugeInt getGaugeInt(String gaugeName, int potentialStartingValue) {
|
||||
//Try and get the guage.
|
||||
MutableMetric metric = metricsMap.get(gaugeName);
|
||||
|
||||
//If it's not there then try and put a new one in the storage.
|
||||
if (metric == null) {
|
||||
//Create the potential new gauge.
|
||||
MutableGaugeInt newGauge = new MutableGaugeInt(new MetricsInfoImpl(gaugeName, ""),
|
||||
potentialStartingValue);
|
||||
|
||||
// Try and put the gauge in. This is atomic.
|
||||
metric = metricsMap.putIfAbsent(gaugeName, newGauge);
|
||||
|
||||
//If the value we get back is null then the put was successful and we will return that.
|
||||
//otherwise gaugeInt should contain the thing that was in before the put could be completed.
|
||||
if (metric == null) {
|
||||
return newGauge;
|
||||
}
|
||||
}
|
||||
|
||||
if (!(metric instanceof MutableGaugeInt)) {
|
||||
throw new MetricsException("Metric already exists in registry for metric name: " + gaugeName +
|
||||
" and not of type MetricMutableGaugeInr");
|
||||
}
|
||||
|
||||
return (MutableGaugeInt) metric;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get a MetricMutableCounterLong from the storage. If it is not there atomically put it.
|
||||
*
|
||||
|
|
|
@ -22,10 +22,9 @@ import java.util.HashMap;
|
|||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
|
||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.hbase.CompatibilitySingletonFactory;
|
||||
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
|
||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.hbase.metrics.BaseSource;
|
||||
import org.apache.hadoop.hbase.metrics.MetricRegistryInfo;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
|
@ -134,7 +133,8 @@ public class MetricsSource implements BaseSource {
|
|||
* @param tableName String as group and tableName
|
||||
*/
|
||||
public void setAgeOfLastShippedOpByTable(long timestamp, String tableName) {
|
||||
getSourceForTable(tableName).setLastShippedAge(EnvironmentEdgeManager.currentTime() - timestamp);
|
||||
getSourceForTable(tableName)
|
||||
.setLastShippedAge(EnvironmentEdgeManager.currentTime() - timestamp);
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -184,6 +184,22 @@ public class MetricsSource implements BaseSource {
|
|||
globalSourceSource.decrSizeOfLogQueue(1);
|
||||
}
|
||||
|
||||
/**
|
||||
* Increment the count for initializing sources
|
||||
*/
|
||||
public void incrSourceInitializing() {
|
||||
singleSourceSource.incrSourceInitializing();
|
||||
globalSourceSource.incrSourceInitializing();
|
||||
}
|
||||
|
||||
/**
|
||||
* Decrement the count for initializing sources
|
||||
*/
|
||||
public void decrSourceInitializing() {
|
||||
singleSourceSource.decrSourceInitializing();
|
||||
globalSourceSource.decrSourceInitializing();
|
||||
}
|
||||
|
||||
/**
|
||||
* Add on the the number of log edits read
|
||||
*
|
||||
|
@ -290,6 +306,14 @@ public class MetricsSource implements BaseSource {
|
|||
return lastTimestamp;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the source initializing counts
|
||||
* @return number of replication sources getting initialized
|
||||
*/
|
||||
public int getSourceInitializing() {
|
||||
return singleSourceSource.getSourceInitializing();
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the slave peer ID
|
||||
* @return peerID
|
||||
|
|
|
@ -21,7 +21,6 @@ package org.apache.hadoop.hbase.replication.regionserver;
|
|||
import com.google.common.collect.Lists;
|
||||
import com.google.common.util.concurrent.ListenableFuture;
|
||||
import com.google.common.util.concurrent.Service;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
|
@ -34,7 +33,6 @@ import java.util.concurrent.ConcurrentHashMap;
|
|||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
@ -119,6 +117,8 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf
|
|||
private int maxRetriesMultiplier;
|
||||
// Indicates if this particular source is running
|
||||
private volatile boolean sourceRunning = false;
|
||||
// Indicates if the source initialization is in progress
|
||||
private volatile boolean startupOngoing = false;
|
||||
// Metrics for this source
|
||||
private MetricsSource metrics;
|
||||
// ReplicationEndpoint which will handle the actual replication
|
||||
|
@ -266,16 +266,19 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf
|
|||
public void run() {
|
||||
// mark we are running now
|
||||
this.sourceRunning = true;
|
||||
this.setSourceStartupStatus(true);
|
||||
try {
|
||||
// start the endpoint, connect to the cluster
|
||||
Service.State state = replicationEndpoint.start().get();
|
||||
if (state != Service.State.RUNNING) {
|
||||
LOG.warn("ReplicationEndpoint was not started. Exiting");
|
||||
uninitialize();
|
||||
this.setSourceStartupStatus(false);
|
||||
return;
|
||||
}
|
||||
} catch (Exception ex) {
|
||||
LOG.warn("Error starting ReplicationEndpoint, exiting", ex);
|
||||
this.setSourceStartupStatus(false);
|
||||
throw new RuntimeException(ex);
|
||||
}
|
||||
|
||||
|
@ -300,6 +303,7 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf
|
|||
}
|
||||
|
||||
if (!this.isSourceActive()) {
|
||||
this.setSourceStartupStatus(false);
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -310,6 +314,7 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf
|
|||
+ peerClusterId + " which is not allowed by ReplicationEndpoint:"
|
||||
+ replicationEndpoint.getClass().getName(), null, false);
|
||||
this.manager.closeQueue(this);
|
||||
this.setSourceStartupStatus(false);
|
||||
return;
|
||||
}
|
||||
LOG.info("Replicating " + clusterId + " -> " + peerClusterId);
|
||||
|
@ -327,6 +332,16 @@ public class ReplicationSource extends Thread implements ReplicationSourceInterf
|
|||
worker.startup();
|
||||
}
|
||||
}
|
||||
this.setSourceStartupStatus(false);
|
||||
}
|
||||
|
||||
private synchronized void setSourceStartupStatus(boolean initializing) {
|
||||
startupOngoing = initializing;
|
||||
if (initializing) {
|
||||
metrics.incrSourceInitializing();
|
||||
} else {
|
||||
metrics.decrSourceInitializing();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -419,6 +419,22 @@ public class TestReplicationEndpoint extends TestReplicationBase {
|
|||
cells.get(0).getRowLength(), row, 0, row.length));
|
||||
}
|
||||
|
||||
/**
|
||||
* Bad Endpoint with failing connection to peer on demand.
|
||||
*/
|
||||
public static class BadReplicationEndpoint extends ReplicationEndpointForTest {
|
||||
static boolean failing = true;
|
||||
|
||||
public BadReplicationEndpoint() {
|
||||
super();
|
||||
}
|
||||
|
||||
@Override
|
||||
public synchronized UUID getPeerUUID() {
|
||||
return failing ? null : super.getPeerUUID();
|
||||
}
|
||||
}
|
||||
|
||||
public static class ReplicationEndpointForTest extends BaseReplicationEndpoint {
|
||||
static UUID uuid = UUID.randomUUID();
|
||||
static AtomicInteger contructedCount = new AtomicInteger();
|
||||
|
|
|
@ -863,7 +863,7 @@ public class TestReplicationSource {
|
|||
}
|
||||
}
|
||||
|
||||
/*
|
||||
/**
|
||||
Test age of oldest wal metric.
|
||||
*/
|
||||
@Test
|
||||
|
@ -898,6 +898,29 @@ public class TestReplicationSource {
|
|||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplicationSourceInitializingMetric() throws Exception {
|
||||
String id = "1";
|
||||
MetricsSource metrics = Mockito.spy(new MetricsSource(id));
|
||||
Mocks mocks = new Mocks();
|
||||
ReplicationSource source = mocks.createReplicationSourceWithMocks(metrics,
|
||||
new TestReplicationEndpoint.BadReplicationEndpoint());
|
||||
source.startup();
|
||||
final MetricsReplicationSourceSource metricsSource1 = getSourceMetrics(id);
|
||||
Waiter.waitFor(conf, 20000, new Waiter.Predicate<Exception>() {
|
||||
@Override public boolean evaluate() {
|
||||
return metricsSource1.getSourceInitializing() == 1;
|
||||
}
|
||||
});
|
||||
TestReplicationEndpoint.BadReplicationEndpoint.failing = false;
|
||||
Waiter.waitFor(conf, 20000, new Waiter.Predicate<Exception>() {
|
||||
@Override public boolean evaluate() {
|
||||
return metricsSource1.getSourceInitializing() == 0;
|
||||
}
|
||||
});
|
||||
metrics.clear();
|
||||
}
|
||||
|
||||
private MetricsReplicationSourceSource getSourceMetrics(String sourceId) {
|
||||
MetricsReplicationSourceFactory factory = CompatibilitySingletonFactory
|
||||
.getInstance(MetricsReplicationSourceFactory.class);
|
||||
|
|
Loading…
Reference in New Issue