HBASE-25741: Deadlock during peer cleanup with NoNodeException (#3204)
Introduced due to commit from HBASE-25583. Fix is to issue the cleanup asynchronously. Signed-off-by: Bharath Vissapragada <bharathv@apache.org>
This commit is contained in:
parent
852b1b0826
commit
a0dd384691
|
@ -241,7 +241,7 @@ public class ReplicationSourceManager implements ReplicationListener {
|
||||||
if (peerId.contains("-")) {
|
if (peerId.contains("-")) {
|
||||||
peerId = peerId.split("-")[0];
|
peerId = peerId.split("-")[0];
|
||||||
}
|
}
|
||||||
peerRemoved(peerId);
|
schedulePeerRemoval(peerId);
|
||||||
}
|
}
|
||||||
walSet.clear();
|
walSet.clear();
|
||||||
}
|
}
|
||||||
|
@ -653,6 +653,23 @@ public class ReplicationSourceManager implements ReplicationListener {
|
||||||
transferQueues(regionserver);
|
transferQueues(regionserver);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* We want to run the peer removal in a separate thread when the peer removal
|
||||||
|
* is called from ReplicationSource shipper thread on encountering NoNodeException
|
||||||
|
* because peerRemoved terminate the source which might leave replication source
|
||||||
|
* in orphaned state.
|
||||||
|
* See HBASE-25741.
|
||||||
|
* @param peerId peer ID to be removed.
|
||||||
|
*/
|
||||||
|
private void schedulePeerRemoval(final String peerId) {
|
||||||
|
LOG.info(String.format("Scheduling an async peer removal for peer %s", peerId));
|
||||||
|
this.executor.submit(new Runnable() {
|
||||||
|
@Override public void run() {
|
||||||
|
peerRemoved(peerId);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void peerRemoved(String peerId) {
|
public void peerRemoved(String peerId) {
|
||||||
removePeer(peerId);
|
removePeer(peerId);
|
||||||
|
|
|
@ -259,8 +259,7 @@ public class TestReplicationSource {
|
||||||
KeyValue kv = new KeyValue(b, b, b);
|
KeyValue kv = new KeyValue(b, b, b);
|
||||||
WALEdit edit = new WALEdit();
|
WALEdit edit = new WALEdit();
|
||||||
edit.add(kv);
|
edit.add(kv);
|
||||||
WALKey key = new WALKey(b, TableName.valueOf(b), 0, 0,
|
WALKey key = new WALKey(b, TableName.valueOf(b), 0, 0, HConstants.DEFAULT_CLUSTER_ID);
|
||||||
HConstants.DEFAULT_CLUSTER_ID);
|
|
||||||
NavigableMap<byte[], Integer> scopes = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
|
NavigableMap<byte[], Integer> scopes = new TreeMap<byte[], Integer>(Bytes.BYTES_COMPARATOR);
|
||||||
scopes.put(b, HConstants.REPLICATION_SCOPE_GLOBAL);
|
scopes.put(b, HConstants.REPLICATION_SCOPE_GLOBAL);
|
||||||
key.setScopes(scopes);
|
key.setScopes(scopes);
|
||||||
|
@ -565,7 +564,13 @@ public class TestReplicationSource {
|
||||||
};
|
};
|
||||||
|
|
||||||
final ReplicationSource source = mocks.createReplicationSourceAndManagerWithMocks(endpoint);
|
final ReplicationSource source = mocks.createReplicationSourceAndManagerWithMocks(endpoint);
|
||||||
source.run();
|
source.startup();
|
||||||
|
// source thread should be active
|
||||||
|
Waiter.waitFor(conf, 20000, new Waiter.Predicate<Exception>() {
|
||||||
|
@Override public boolean evaluate() {
|
||||||
|
return source.isAlive();
|
||||||
|
}
|
||||||
|
});
|
||||||
source.enqueueLog(log1);
|
source.enqueueLog(log1);
|
||||||
|
|
||||||
// Wait for source to replicate
|
// Wait for source to replicate
|
||||||
|
@ -588,6 +593,14 @@ public class TestReplicationSource {
|
||||||
return !source.isSourceActive();
|
return !source.isSourceActive();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// And the source thread be terminated
|
||||||
|
Waiter.waitFor(conf, 20000, new Waiter.Predicate<Exception>() {
|
||||||
|
@Override public boolean evaluate() {
|
||||||
|
return !source.isAlive();
|
||||||
|
}
|
||||||
|
});
|
||||||
|
assertTrue("Source should be removed", mocks.manager.getSources().isEmpty());
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
|
|
|
@ -22,6 +22,7 @@ import java.net.URLEncoder;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import org.apache.hadoop.hbase.KeyValue;
|
import org.apache.hadoop.hbase.KeyValue;
|
||||||
|
import org.apache.hadoop.hbase.Waiter;
|
||||||
import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl;
|
import org.apache.hadoop.hbase.regionserver.MultiVersionConcurrencyControl;
|
||||||
import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
|
import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
|
||||||
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
|
import org.apache.hadoop.hbase.regionserver.wal.WALEdit;
|
||||||
|
@ -78,19 +79,27 @@ public class TestReplicationSourceWithoutReplicationZnodes
|
||||||
wal.sync(txid);
|
wal.sync(txid);
|
||||||
|
|
||||||
wal.rollWriter();
|
wal.rollWriter();
|
||||||
ZKUtil.deleteNodeRecursively(zkw, "/hbase/replication/peers/1");
|
Waiter.waitFor(conf, 20000, new Waiter.Predicate<Exception>() {
|
||||||
ZKUtil.deleteNodeRecursively(zkw, "/hbase/replication/rs/" + server.getServerName() + "/1");
|
@Override public boolean evaluate() {
|
||||||
|
return !manager.getSources().isEmpty();
|
||||||
|
}
|
||||||
|
});
|
||||||
Assert.assertEquals("There should be exactly one source",
|
Assert.assertEquals("There should be exactly one source",
|
||||||
1, manager.getSources().size());
|
1, manager.getSources().size());
|
||||||
Assert.assertEquals("Replication source is not correct",
|
Assert.assertEquals("Replication source is not correct",
|
||||||
ReplicationSourceDummyWithNoTermination.class,
|
ReplicationSourceDummyWithNoTermination.class,
|
||||||
manager.getSources().get(0).getClass());
|
manager.getSources().get(0).getClass());
|
||||||
|
// delete the znodes for peer
|
||||||
|
ZKUtil.deleteNodeRecursively(zkw, "/hbase/replication/peers/1");
|
||||||
|
ZKUtil.deleteNodeRecursively(zkw, "/hbase/replication/rs/" + server.getServerName() + "/1");
|
||||||
manager
|
manager
|
||||||
.logPositionAndCleanOldLogs(manager.getSources().get(0).getCurrentPath(), "1", 0, false,
|
.logPositionAndCleanOldLogs(manager.getSources().get(0).getCurrentPath(), "1", 0, false,
|
||||||
false);
|
false);
|
||||||
Assert.assertTrue("Replication source should be terminated and removed",
|
Waiter.waitFor(conf, 20000, new Waiter.Predicate<Exception>() {
|
||||||
manager.getSources().isEmpty());
|
@Override public boolean evaluate() {
|
||||||
|
return manager.getSources().isEmpty();
|
||||||
|
}
|
||||||
|
});
|
||||||
} finally {
|
} finally {
|
||||||
conf.set("replication.replicationsource.implementation", replicationSourceImplName);
|
conf.set("replication.replicationsource.implementation", replicationSourceImplName);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue