HBASE-26482 HMaster may clean wals that is replicating in rare cases (#3887)

Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
zhengzhuobinzzb 2021-12-05 22:26:20 +08:00 committed by GitHub
parent c472329460
commit 8d96fc3614
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 68 additions and 1 deletions

View File

@ -19,6 +19,7 @@
package org.apache.hadoop.hbase.replication;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.zookeeper.KeeperException;
@ -67,6 +68,13 @@ public interface ReplicationQueuesClient {
*/
int getQueuesZNodeCversion() throws KeeperException;
/**
* Get a map of cversion of all replicator nodes. This can be used as optimistic locking
* to get a consistent snapshot of the replication queues.
* @return a map of replicator to cversion
*/
Map<String, Integer> getReplicatorsZNodeCversion() throws KeeperException;
/**
* Get the change version number of replication hfile references node. This can be used as
* optimistic locking to get a consistent snapshot of the replication queues of hfile references.

View File

@ -18,7 +18,9 @@
*/
package org.apache.hadoop.hbase.replication;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.hadoop.hbase.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
@ -92,6 +94,18 @@ public class ReplicationQueuesClientZKImpl extends ReplicationStateZKBase implem
}
}
@Override public Map<String, Integer> getReplicatorsZNodeCversion()
throws KeeperException {
List<String> rss = super.getListOfReplicatorsZK();
Map<String, Integer> rsToCversion = new HashMap<>();
for (String rs : rss) {
Stat stat = new Stat();
ZKUtil.getDataNoWatch(this.zookeeper, ZKUtil.joinZNode(this.queuesZNode, rs), stat);
rsToCversion.put(rs, stat.getCversion());
}
return rsToCversion;
}
@Override
public int getHFileRefsNodeChangeVersion() throws KeeperException {
Stat stat = new Stat();

View File

@ -106,6 +106,9 @@ public class ReplicationLogCleaner extends BaseLogCleanerDelegate {
LOG.debug("Didn't find any region server that replicates, won't prevent any deletions.");
return ImmutableSet.of();
}
// We should also check cversions of all rs nodes to Prevent missing of WAL which are claiming
// by other regionServer. For details, please see HBASE-26482
Map<String, Integer> rsToCversionBefore = replicationQueues.getReplicatorsZNodeCversion();
Set<String> wals = Sets.newHashSet();
for (String rs : rss) {
List<String> listOfPeers = replicationQueues.getAllQueues(rs);
@ -121,7 +124,8 @@ public class ReplicationLogCleaner extends BaseLogCleanerDelegate {
}
}
int v1 = replicationQueues.getQueuesZNodeCversion();
if (v0 == v1) {
Map<String, Integer> rsToCversionAfter = replicationQueues.getReplicatorsZNodeCversion();
if (v0 == v1 && rsToCversionBefore.equals(rsToCversionAfter)) {
return wals;
}
LOG.info(String.format("Replication queue node cversion changed from %d to %d, retry = %d",

View File

@ -21,6 +21,7 @@ import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.atLeast;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.doReturn;
import static org.mockito.Mockito.doThrow;
@ -41,6 +42,7 @@ import java.util.Map;
import java.util.Random;
import java.util.concurrent.atomic.AtomicBoolean;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
@ -188,6 +190,9 @@ public class TestLogsCleaner {
ReplicationQueuesClient rqcMock = mock(ReplicationQueuesClient.class);
Mockito.when(rqcMock.getQueuesZNodeCversion()).thenReturn(1, 2, 3, 4);
// Avoid direct return because there no replicator.
Mockito.when(rqcMock.getListOfReplicators())
.thenReturn(Lists.newArrayList("s1", "s2"));
Field rqc = ReplicationLogCleaner.class.getDeclaredField("replicationQueues");
rqc.setAccessible(true);
@ -196,6 +201,35 @@ public class TestLogsCleaner {
// This should return eventually when cversion stabilizes
cleaner.getDeletableFiles(new LinkedList<FileStatus>());
// Test did get an optimistic lock
Mockito.verify(rqcMock, atLeast(5)).getQueuesZNodeCversion();
}
@Test
public void testReplicatorZnodeCversionChange()
throws KeeperException, NoSuchFieldException, IllegalAccessException {
Configuration conf = TEST_UTIL.getConfiguration();
ReplicationLogCleaner cleaner = new ReplicationLogCleaner();
cleaner.setConf(conf);
ReplicationQueuesClient rqcMock = mock(ReplicationQueuesClient.class);
// Avoid direct return because there no replicator.
Mockito.when(rqcMock.getListOfReplicators()).thenReturn(Lists.newArrayList("s1", "s2"));
Mockito.when(rqcMock.getReplicatorsZNodeCversion()).thenReturn(
ImmutableMap.of("s1", 0, "s2", 0),
ImmutableMap.of("s1", 1, "s2", 1),
ImmutableMap.of("s1", 2, "s2", 2),
ImmutableMap.of("s1", 3, "s2", 3));
Field rqc = ReplicationLogCleaner.class.getDeclaredField("replicationQueues");
rqc.setAccessible(true);
rqc.set(cleaner, rqcMock);
// This should return eventually when cversion stabilizes
cleaner.getDeletableFiles(new LinkedList<FileStatus>());
// Test did get an optimistic lock
Mockito.verify(rqcMock, atLeast(5)).getReplicatorsZNodeCversion();
}
@Test(timeout=10000)

View File

@ -127,6 +127,13 @@ public class TestReplicationStateZKImpl extends TestReplicationStateBasic {
assertTrue(rqZK.isPeerPath(peerPath));
}
@Test
public void testZNodeCversion() throws ReplicationException, KeeperException {
rq1.init(server1);
assertTrue(rqc.getReplicatorsZNodeCversion().containsKey(server1));
}
static class DummyServer implements Server {
private String serverName;
private boolean isAborted = false;