HBASE-27212 Implement a new table based replication queue storage and make the minimum replication system work (#4672)
Signed-off-by: Xin Sun <ddupgs@gmail.com>
parent 8b8c684bcc
commit e2d8db6dfc
@@ -679,16 +679,13 @@ message ClaimReplicationQueueRemoteStateData {
   required ServerName crashed_server = 1;
   required string queue = 2;
   required ServerName target_server = 3;
+  optional ServerName source_server = 4;
 }
 
 message ClaimReplicationQueueRemoteParameter {
   required ServerName crashed_server = 1;
   required string queue = 2;
+  optional ServerName source_server = 3;
 }
 
-enum ClaimReplicationQueuesState {
-  CLAIM_REPLICATION_QUEUES_DISPATCH = 1;
-  CLAIM_REPLICATION_QUEUES_FINISH = 2;
-}
-
 enum ModifyTableDescriptorState {
@@ -715,3 +712,13 @@ message ModifyStoreFileTrackerStateData {
 message ModifyColumnFamilyStoreFileTrackerStateData {
   required bytes family = 1;
 }
+
+enum AssignReplicationQueuesState {
+  ASSIGN_REPLICATION_QUEUES_PRE_CHECK = 1;
+  ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES = 2;
+  ASSIGN_REPLICATION_QUEUES_CLAIM = 3;
+}
+
+message AssignReplicationQueuesStateData {
+  required ServerName crashed_server = 1;
+}
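For orientation only: a state enum such as AssignReplicationQueuesState is what a master procedure's state machine steps through. The sketch below is a hypothetical illustration of that pattern (the method body and transitions are assumptions, not part of this diff; Flow, setNextState and MasterProcedureEnv are the usual procedure-framework types):

  // Hypothetical sketch of a procedure consuming the state enum above.
  protected Flow executeFromState(MasterProcedureEnv env, AssignReplicationQueuesState state) {
    switch (state) {
      case ASSIGN_REPLICATION_QUEUES_PRE_CHECK:
        // e.g. check that replication peers still exist for the crashed server
        setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES);
        return Flow.HAS_MORE_STATE;
      case ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES:
        setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_CLAIM);
        return Flow.HAS_MORE_STATE;
      case ASSIGN_REPLICATION_QUEUES_CLAIM:
        // dispatch the actual claiming of the crashed server's queues
        return Flow.NO_MORE_STATE;
      default:
        throw new UnsupportedOperationException("unhandled state=" + state);
    }
  }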
@@ -0,0 +1,57 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.replication;

import org.apache.yetus.audience.InterfaceAudience;

@InterfaceAudience.Private
public class ReplicationGroupOffset {

  public static final ReplicationGroupOffset BEGIN = new ReplicationGroupOffset("", 0L);

  private final String wal;

  private final long offset;

  public ReplicationGroupOffset(String wal, long offset) {
    this.wal = wal;
    this.offset = offset;
  }

  public String getWal() {
    return wal;
  }

  /**
   * A negative value means this file has already been fully replicated out
   */
  public long getOffset() {
    return offset;
  }

  @Override
  public String toString() {
    return wal + ":" + offset;
  }

  public static ReplicationGroupOffset parse(String str) {
    int index = str.lastIndexOf(':');
    return new ReplicationGroupOffset(str.substring(0, index),
      Long.parseLong(str.substring(index + 1)));
  }
}
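An offset is serialized as "<wal name>:<position>", and parse() splits on the last ':' so the WAL name itself round-trips unchanged. A minimal sketch of the round-trip (the WAL name below is made up for illustration):

  ReplicationGroupOffset offset =
    new ReplicationGroupOffset("rs1.example.org%2C16020%2C1600000000000.1600000000001", 2048L);
  String serialized = offset.toString(); // "...1600000000001:2048"
  ReplicationGroupOffset parsed = ReplicationGroupOffset.parse(serialized);
  assert parsed.getWal().equals(offset.getWal()) && parsed.getOffset() == 2048L;
  // A negative offset means the file has already been fully replicated out.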
@@ -0,0 +1,47 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.replication;

import org.apache.yetus.audience.InterfaceAudience;

import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap;

/**
 * Representing all the information for a replication queue.
 */
@InterfaceAudience.Private
public class ReplicationQueueData {

  private final ReplicationQueueId id;

  private final ImmutableMap<String, ReplicationGroupOffset> offsets;

  public ReplicationQueueData(ReplicationQueueId id,
    ImmutableMap<String, ReplicationGroupOffset> offsets) {
    this.id = id;
    this.offsets = offsets;
  }

  public ReplicationQueueId getId() {
    return id;
  }

  public ImmutableMap<String, ReplicationGroupOffset> getOffsets() {
    return offsets;
  }
}
@@ -0,0 +1,141 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.replication;

import java.util.Objects;
import java.util.Optional;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.yetus.audience.InterfaceAudience;

@InterfaceAudience.Private
public class ReplicationQueueId {

  private final ServerName serverName;

  private final String peerId;

  private final Optional<ServerName> sourceServerName;

  // we do not allow '-' in peer names so it is safe to use it as the separator for peer id and
  // server name
  private static final char PEER_ID_SEPARATOR = '-';

  // The '/' character is not valid for a hostname or a nodename(FQDN, so it is safe to use it as
  // the separator for server names)
  private static final char SERVER_NAME_SEPARATOR = '/';

  public ReplicationQueueId(ServerName serverName, String peerId) {
    this.serverName = Objects.requireNonNull(serverName);
    this.peerId = Objects.requireNonNull(peerId);
    this.sourceServerName = Optional.empty();
  }

  public ReplicationQueueId(ServerName serverName, String peerId, ServerName sourceServerName) {
    this.serverName = Objects.requireNonNull(serverName);
    this.peerId = Objects.requireNonNull(peerId);
    this.sourceServerName = Optional.of(sourceServerName);
  }

  public ServerName getServerName() {
    return serverName;
  }

  public String getPeerId() {
    return peerId;
  }

  public Optional<ServerName> getSourceServerName() {
    return sourceServerName;
  }

  public ServerName getServerWALsBelongTo() {
    return sourceServerName.orElse(serverName);
  }

  public boolean isRecovered() {
    return sourceServerName.isPresent();
  }

  public ReplicationQueueId claim(ServerName targetServerName) {
    ServerName newSourceServerName = sourceServerName.orElse(serverName);
    return new ReplicationQueueId(targetServerName, peerId, newSourceServerName);
  }

  @Override
  public int hashCode() {
    return Objects.hash(peerId, serverName, sourceServerName);
  }

  @Override
  public boolean equals(Object obj) {
    if (this == obj) {
      return true;
    }
    if (!(obj instanceof ReplicationQueueId)) {
      return false;
    }
    ReplicationQueueId other = (ReplicationQueueId) obj;
    return Objects.equals(peerId, other.peerId) && Objects.equals(serverName, other.serverName)
      && Objects.equals(sourceServerName, other.sourceServerName);
  }

  @Override
  public String toString() {
    StringBuilder sb =
      new StringBuilder().append(peerId).append(PEER_ID_SEPARATOR).append(serverName);
    sourceServerName.ifPresent(s -> sb.append(SERVER_NAME_SEPARATOR).append(s.toString()));
    return sb.toString();
  }

  public static ReplicationQueueId parse(String str) {
    int dashIndex = str.indexOf(PEER_ID_SEPARATOR);
    String peerId = str.substring(0, dashIndex);
    int slashIndex = str.indexOf(SERVER_NAME_SEPARATOR, dashIndex + 1);
    if (slashIndex < 0) {
      String serverName = str.substring(dashIndex + 1);
      return new ReplicationQueueId(ServerName.valueOf(serverName), peerId);
    } else {
      String serverName = str.substring(dashIndex + 1, slashIndex);
      String sourceServerName = str.substring(slashIndex + 1);
      return new ReplicationQueueId(ServerName.valueOf(serverName), peerId,
        ServerName.valueOf(sourceServerName));
    }
  }

  public static String getPeerId(String str) {
    int dashIndex = str.indexOf(PEER_ID_SEPARATOR);
    return str.substring(0, dashIndex);
  }

  public static byte[] getScanPrefix(ServerName serverName, String peerId) {
    return Bytes.toBytes(peerId + PEER_ID_SEPARATOR + serverName.toString());
  }

  public static byte[] getScanPrefix(String peerId) {
    return Bytes.toBytes(peerId + PEER_ID_SEPARATOR);
  }

  private static char getNextChar(char c) {
    return (char) ((int) c + 1);
  }

  public static byte[] getScanStartRowForNextPeerId(String peerId) {
    return Bytes.toBytes(peerId + getNextChar(PEER_ID_SEPARATOR));
  }
}
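The queue id therefore serializes as "<peerId>-<owning server>[/<server the WALs belong to>]", which is also the row key used by the table based storage. A short sketch of the encoding and of claiming a dead server's queue (server names below are made up for illustration):

  ServerName rs = ServerName.valueOf("rs1.example.org", 16020, 1600000000000L);
  ServerName dead = ServerName.valueOf("rs2.example.org", 16020, 1600000000000L);
  ReplicationQueueId queueId = new ReplicationQueueId(rs, "peer_1");
  String row = queueId.toString(); // "peer_1-rs1.example.org,16020,1600000000000"
  // After rs1 claims rs2's queue, the recovered id still records which server the WALs belong to:
  // "peer_1-rs1.example.org,16020,1600000000000/rs2.example.org,16020,1600000000000"
  ReplicationQueueId recovered = new ReplicationQueueId(dead, "peer_1").claim(rs);
  assert recovered.isRecovered() && recovered.getServerWALsBelongTo().equals(dead);
  assert ReplicationQueueId.parse(recovered.toString()).equals(recovered);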
@@ -20,7 +20,6 @@ package org.apache.hadoop.hbase.replication;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.SortedSet;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.util.Pair;
@@ -33,40 +32,79 @@ import org.apache.yetus.audience.InterfaceAudience;
 public interface ReplicationQueueStorage {

  /**
   * Remove a replication queue for a given regionserver.
   * @param serverName the name of the regionserver
   * @param queueId a String that identifies the queue.
   * Set the current offset for a specific WAL group in a given queue.
   * @param queueId the id of the queue
   * @param walGroup the group of the WAL, can be empty if multi wal is not enabled
   * @param offset the current offset of replication progress
   * @param lastSeqIds map with {encodedRegionName, sequenceId} pairs for serial replication.
   */
  void removeQueue(ServerName serverName, String queueId) throws ReplicationException;
  void setOffset(ReplicationQueueId queueId, String walGroup, ReplicationGroupOffset offset,
    Map<String, Long> lastSeqIds) throws ReplicationException;

  /**
   * Add a new WAL file to the given queue for a given regionserver. If the queue does not exist it
   * is created.
   * @param serverName the name of the regionserver
   * @param queueId a String that identifies the queue.
   * @param fileName name of the WAL
   * Get the current offset of all the WAL groups for a queue
   * @param queueId the id of the queue
   * @return a map of all offsets of the WAL groups. The key the is WAL group and the value is the
   *         position.
   */
  void addWAL(ServerName serverName, String queueId, String fileName) throws ReplicationException;

  /**
   * Remove an WAL file from the given queue for a given regionserver.
   * @param serverName the name of the regionserver
   * @param queueId a String that identifies the queue.
   * @param fileName name of the WAL
   */
  void removeWAL(ServerName serverName, String queueId, String fileName)
  Map<String, ReplicationGroupOffset> getOffsets(ReplicationQueueId queueId)
    throws ReplicationException;

  /**
   * Set the current position for a specific WAL in a given queue for a given regionserver.
   * @param serverName the name of the regionserver
   * @param queueId a String that identifies the queue
   * @param fileName name of the WAL
   * @param position the current position in the file. Will ignore if less than or equal to 0.
   * @param lastSeqIds map with {encodedRegionName, sequenceId} pairs for serial replication.
   * Get a list of all queues for the specific peer.
   * @param peerId the id of the peer
   * @return a list of queueIds
   */
  void setWALPosition(ServerName serverName, String queueId, String fileName, long position,
    Map<String, Long> lastSeqIds) throws ReplicationException;
  List<ReplicationQueueId> listAllQueueIds(String peerId) throws ReplicationException;

  /**
   * Get a list of all queues for the specific region server.
   * @param serverName the server name of the region server that owns the set of queues
   * @return a list of queueIds
   */
  List<ReplicationQueueId> listAllQueueIds(ServerName serverName) throws ReplicationException;

  /**
   * Get a list of all queues for the specific region server and the specific peer
   * @param peerId the id of the peer
   * @param serverName the server name of the region server that owns the set of queues
   * @return a list of queueIds
   */
  List<ReplicationQueueId> listAllQueueIds(String peerId, ServerName serverName)
    throws ReplicationException;

  /**
   * Get a list of all queues and the offsets.
   */
  List<ReplicationQueueData> listAllQueues() throws ReplicationException;

  /**
   * Get a list of all region servers that have outstanding replication queues. These servers could
   * be alive, dead or from a previous run of the cluster.
   * @return a list of server names
   */
  List<ServerName> listAllReplicators() throws ReplicationException;

  /**
   * Change ownership for the queue identified by queueId and belongs to a dead region server.
   * @param queueId the id of the queue
   * @param targetServerName the name of the target region server
   * @return the new PeerId and A SortedSet of WALs in its queue
   */
  Map<String, ReplicationGroupOffset> claimQueue(ReplicationQueueId queueId,
    ServerName targetServerName) throws ReplicationException;

  /**
   * Remove a replication queue
   * @param queueId the id of the queue to remove
   */
  void removeQueue(ReplicationQueueId queueId) throws ReplicationException;

  /**
   * Remove all the replication queues for the given peer. Usually used when removing a peer.
   * @param peerId the id of the peer
   */
  void removeAllQueues(String peerId) throws ReplicationException;

  /**
   * Read the max sequence id of the specific region for a given peer. For serial replication, we
@@ -99,67 +137,6 @@ public interface ReplicationQueueStorage {
  void removeLastSequenceIds(String peerId, List<String> encodedRegionNames)
    throws ReplicationException;

  /**
   * Get the current position for a specific WAL in a given queue for a given regionserver.
   * @param serverName the name of the regionserver
   * @param queueId a String that identifies the queue
   * @param fileName name of the WAL
   * @return the current position in the file
   */
  long getWALPosition(ServerName serverName, String queueId, String fileName)
    throws ReplicationException;

  /**
   * Get a list of all WALs in the given queue on the given region server.
   * @param serverName the server name of the region server that owns the queue
   * @param queueId a String that identifies the queue
   * @return a list of WALs
   */
  List<String> getWALsInQueue(ServerName serverName, String queueId) throws ReplicationException;

  /**
   * Get a list of all queues for the specified region server.
   * @param serverName the server name of the region server that owns the set of queues
   * @return a list of queueIds
   */
  List<String> getAllQueues(ServerName serverName) throws ReplicationException;

  /**
   * Change ownership for the queue identified by queueId and belongs to a dead region server.
   * @param sourceServerName the name of the dead region server
   * @param destServerName the name of the target region server
   * @param queueId the id of the queue
   * @return the new PeerId and A SortedSet of WALs in its queue
   */
  Pair<String, SortedSet<String>> claimQueue(ServerName sourceServerName, String queueId,
    ServerName destServerName) throws ReplicationException;

  /**
   * Remove the record of region server if the queue is empty.
   */
  void removeReplicatorIfQueueIsEmpty(ServerName serverName) throws ReplicationException;

  /**
   * Get a list of all region servers that have outstanding replication queues. These servers could
   * be alive, dead or from a previous run of the cluster.
   * @return a list of server names
   */
  List<ServerName> getListOfReplicators() throws ReplicationException;

  /**
   * Load all wals in all replication queues. This method guarantees to return a snapshot which
   * contains all WALs at the start of this call even there is concurrent queue failover. However,
   * some newly created WALs during the call may not be included.
   */
  Set<String> getAllWALs() throws ReplicationException;

  /**
   * Add a peer to hfile reference queue if peer does not exist.
   * @param peerId peer cluster id to be added
   * @throws ReplicationException if fails to add a peer id to hfile reference queue
   */
  void addPeerToHFileRefs(String peerId) throws ReplicationException;

  /**
   * Remove a peer from hfile reference queue.
   * @param peerId peer cluster id to be removed
@@ -201,11 +178,4 @@ public interface ReplicationQueueStorage {
   * created hfile references during the call may not be included.
   */
  Set<String> getAllHFileRefs() throws ReplicationException;

  /**
   * Get full znode name for given region server
   * @param serverName the name of the region server
   * @return full znode name
   */
  String getRsNode(ServerName serverName);
}
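As a usage sketch of the reworked interface (the peer id, WAL group and storage variable below are made up; exception handling is elided), a replication source records and reads back its progress roughly like this:

  ReplicationQueueStorage storage = ...; // e.g. obtained from ReplicationStorageFactory
  ServerName rs = ServerName.valueOf("rs1.example.org,16020,1600000000000");
  ReplicationQueueId queueId = new ReplicationQueueId(rs, "peer_1");
  // record that this WAL group has been shipped up to offset 4096 of the given WAL
  storage.setOffset(queueId, "rs1.example.org%2C16020%2C1600000000000",
    new ReplicationGroupOffset("rs1.example.org%2C16020%2C1600000000000.1600000000001", 4096L),
    Collections.emptyMap());
  Map<String, ReplicationGroupOffset> offsets = storage.getOffsets(queueId);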
@@ -17,9 +17,18 @@
 */
package org.apache.hadoop.hbase.replication;

import java.io.IOException;
import java.lang.reflect.Constructor;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.Coprocessor;
import org.apache.hadoop.hbase.NamespaceDescriptor;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.CoprocessorDescriptorBuilder;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.util.ReflectionUtils;
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
import org.apache.yetus.audience.InterfaceAudience;
@@ -37,6 +46,27 @@ public final class ReplicationStorageFactory {
  public static final ReplicationPeerStorageType DEFAULT_REPLICATION_PEER_STORAGE_IMPL =
    ReplicationPeerStorageType.ZOOKEEPER;

  public static final String REPLICATION_QUEUE_TABLE_NAME = "hbase.replication.queue.table.name";

  public static final TableName REPLICATION_QUEUE_TABLE_NAME_DEFAULT =
    TableName.valueOf(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "replication");

  public static TableDescriptor createReplicationQueueTableDescriptor(TableName tableName)
    throws IOException {
    return TableDescriptorBuilder.newBuilder(tableName)
      .setColumnFamily(ColumnFamilyDescriptorBuilder.of(TableReplicationQueueStorage.QUEUE_FAMILY))
      .setColumnFamily(
        ColumnFamilyDescriptorBuilder.of(TableReplicationQueueStorage.LAST_SEQUENCE_ID_FAMILY))
      .setColumnFamily(
        ColumnFamilyDescriptorBuilder.of(TableReplicationQueueStorage.HFILE_REF_FAMILY))
      .setValue("hbase.regionserver.region.split_restriction.type", "DelimitedKeyPrefix")
      .setValue("hbase.regionserver.region.split_restriction.delimiter", "-")
      .setCoprocessor(CoprocessorDescriptorBuilder
        .newBuilder("org.apache.hadoop.hbase.coprocessor.MultiRowMutationEndpoint")
        .setPriority(Coprocessor.PRIORITY_SYSTEM).build())
      .build();
  }

  private ReplicationStorageFactory() {
  }

@@ -76,8 +106,17 @@ public final class ReplicationStorageFactory {
  /**
   * Create a new {@link ReplicationQueueStorage}.
   */
  public static ReplicationQueueStorage getReplicationQueueStorage(ZKWatcher zk,
  public static ReplicationQueueStorage getReplicationQueueStorage(Connection conn,
    Configuration conf) {
    return new ZKReplicationQueueStorage(zk, conf);
    return getReplicationQueueStorage(conn, TableName.valueOf(conf.get(REPLICATION_QUEUE_TABLE_NAME,
      REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString())));
  }

  /**
   * Create a new {@link ReplicationQueueStorage}.
   */
  public static ReplicationQueueStorage getReplicationQueueStorage(Connection conn,
    TableName tableName) {
    return new TableReplicationQueueStorage(conn, tableName);
  }
}
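A short sketch of wiring this up from a caller's point of view (connection creation and exception handling are assumptions of the example, not part of this diff): the Configuration-based overload resolves the backing table from hbase.replication.queue.table.name, defaulting to the hbase:replication table.

  Configuration conf = HBaseConfiguration.create();
  try (Connection conn = ConnectionFactory.createConnection(conf)) {
    // resolves hbase.replication.queue.table.name, defaulting to hbase:replication
    ReplicationQueueStorage queueStorage =
      ReplicationStorageFactory.getReplicationQueueStorage(conn, conf);
    List<ReplicationQueueData> queues = queueStorage.listAllQueues();
    // (ReplicationException / IOException handling elided in this sketch)
  }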
@@ -28,7 +28,6 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.CompoundConfiguration;
 import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.ServerName;
 import org.apache.hadoop.hbase.TableName;
 import org.apache.yetus.audience.InterfaceAudience;
 import org.slf4j.Logger;
@@ -81,20 +80,6 @@ public final class ReplicationUtils {
    return otherConf;
  }

  public static void removeAllQueues(ReplicationQueueStorage queueStorage, String peerId)
    throws ReplicationException {
    for (ServerName replicator : queueStorage.getListOfReplicators()) {
      List<String> queueIds = queueStorage.getAllQueues(replicator);
      for (String queueId : queueIds) {
        ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
        if (queueInfo.getPeerId().equals(peerId)) {
          queueStorage.removeQueue(replicator, queueId);
        }
      }
      queueStorage.removeReplicatorIfQueueIsEmpty(replicator);
    }
  }

  private static boolean isCollectionEqual(Collection<String> c1, Collection<String> c2) {
    if (c1 == null) {
      return c2 == null;
@@ -0,0 +1,535 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.hbase.replication;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.Set;
import java.util.function.Supplier;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.CellUtil;
import org.apache.hadoop.hbase.CompareOperator;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.AsyncTable;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.Delete;
import org.apache.hadoop.hbase.client.Get;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.ResultScanner;
import org.apache.hadoop.hbase.client.Scan;
import org.apache.hadoop.hbase.client.Scan.ReadType;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.filter.KeyOnlyFilter;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FutureUtils;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.yetus.audience.InterfaceAudience;

import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.ClientProtos.MutationProto.MutationType;
import org.apache.hadoop.hbase.shaded.protobuf.generated.MultiRowMutationProtos;

/**
 * HBase table based replication queue storage.
 */
@InterfaceAudience.Private
public class TableReplicationQueueStorage implements ReplicationQueueStorage {

  public static final byte[] QUEUE_FAMILY = Bytes.toBytes("queue");

  public static final byte[] LAST_SEQUENCE_ID_FAMILY = Bytes.toBytes("sid");

  public static final byte[] HFILE_REF_FAMILY = Bytes.toBytes("hfileref");

  private final Connection conn;

  private final TableName tableName;

  @FunctionalInterface
  private interface TableCreator {

    void create() throws IOException;
  }

  public TableReplicationQueueStorage(Connection conn, TableName tableName) {
    this.conn = conn;
    this.tableName = tableName;
  }

  private void addLastSeqIdsPut(MultiRowMutationProtos.MutateRowsRequest.Builder builder,
    String peerId, Map<String, Long> lastSeqIds, AsyncTable<?> table) throws IOException {
    // get the previous sequence ids first
    byte[] row = Bytes.toBytes(peerId);
    Get get = new Get(row);
    lastSeqIds.keySet().forEach(encodedRegionName -> get.addColumn(LAST_SEQUENCE_ID_FAMILY,
      Bytes.toBytes(encodedRegionName)));
    Result result = FutureUtils.get(table.get(get));
    Put put = new Put(row);
    for (Map.Entry<String, Long> entry : lastSeqIds.entrySet()) {
      String encodedRegionName = entry.getKey();
      long lastSeqId = entry.getValue();
      byte[] encodedRegionNameAsBytes = Bytes.toBytes(encodedRegionName);
      byte[] previousLastSeqIdAsBytes =
        result.getValue(LAST_SEQUENCE_ID_FAMILY, encodedRegionNameAsBytes);
      if (previousLastSeqIdAsBytes != null) {
        long previousLastSeqId = Bytes.toLong(previousLastSeqIdAsBytes);
        if (lastSeqId > previousLastSeqId) {
          // update last seq id when it is greater, and use CAS to make sure we do not overwrite
          // other's value.
          put.addColumn(LAST_SEQUENCE_ID_FAMILY, encodedRegionNameAsBytes,
            Bytes.toBytes(lastSeqId));
          builder.addCondition(ProtobufUtil.toCondition(row, LAST_SEQUENCE_ID_FAMILY,
            encodedRegionNameAsBytes, CompareOperator.EQUAL, previousLastSeqIdAsBytes, null));
        }
      } else {
        // also update last seq id when there is no value yet, and use CAS to make sure we do not
        // overwrite
        // other's value.
        put.addColumn(LAST_SEQUENCE_ID_FAMILY, encodedRegionNameAsBytes, Bytes.toBytes(lastSeqId));
        builder.addCondition(ProtobufUtil.toCondition(row, LAST_SEQUENCE_ID_FAMILY,
          encodedRegionNameAsBytes, CompareOperator.EQUAL, null, null));
      }
    }
    if (!put.isEmpty()) {
      builder.addMutationRequest(ProtobufUtil.toMutation(MutationType.PUT, put));
    }
  }

  @Override
  public void setOffset(ReplicationQueueId queueId, String walGroup, ReplicationGroupOffset offset,
    Map<String, Long> lastSeqIds) throws ReplicationException {
    Put put = new Put(Bytes.toBytes(queueId.toString())).addColumn(QUEUE_FAMILY,
      Bytes.toBytes(walGroup), Bytes.toBytes(offset.toString()));
    AsyncTable<?> asyncTable = conn.toAsyncConnection().getTable(tableName);
    try {
      if (lastSeqIds.isEmpty()) {
        FutureUtils.get(asyncTable.put(put));
      } else {
        for (;;) {
          MultiRowMutationProtos.MutateRowsRequest.Builder builder =
            MultiRowMutationProtos.MutateRowsRequest.newBuilder();
          addLastSeqIdsPut(builder, queueId.getPeerId(), lastSeqIds, asyncTable);
          if (builder.getMutationRequestCount() > 0) {
            // use MultiRowMutationService to atomically update offset and last sequence ids
            MultiRowMutationProtos.MutateRowsRequest request =
              builder.addMutationRequest(ProtobufUtil.toMutation(MutationType.PUT, put)).build();
            MultiRowMutationProtos.MutateRowsResponse responose =
              FutureUtils.get(asyncTable.<MultiRowMutationProtos.MultiRowMutationService.Interface,
                MultiRowMutationProtos.MutateRowsResponse> coprocessorService(
                  MultiRowMutationProtos.MultiRowMutationService::newStub,
                  (stub, controller, done) -> stub.mutateRows(controller, request, done),
                  put.getRow()));
            if (responose.getProcessed()) {
              break;
            }
          } else {
            // we do not need to update last seq id, fallback to single put
            FutureUtils.get(asyncTable.put(put));
            break;
          }
        }
      }
    } catch (IOException e) {
      throw new ReplicationException("failed to setOffset, queueId=" + queueId + ", walGroup="
        + walGroup + ", offset=" + offset + ", lastSeqIds=" + lastSeqIds, e);
    }
  }

  private ImmutableMap<String, ReplicationGroupOffset> parseOffsets(Result result) {
    ImmutableMap.Builder<String, ReplicationGroupOffset> builder =
      ImmutableMap.builderWithExpectedSize(result.size());
    NavigableMap<byte[], byte[]> map = result.getFamilyMap(QUEUE_FAMILY);
    if (map != null) {
      map.forEach((k, v) -> {
        String walGroup = Bytes.toString(k);
        ReplicationGroupOffset offset = ReplicationGroupOffset.parse(Bytes.toString(v));
        builder.put(walGroup, offset);
      });
    }
    return builder.build();
  }

  private Map<String, ReplicationGroupOffset> getOffsets0(Table table, ReplicationQueueId queueId)
    throws IOException {
    Result result = table.get(new Get(Bytes.toBytes(queueId.toString())).addFamily(QUEUE_FAMILY));
    return parseOffsets(result);
  }

  @Override
  public Map<String, ReplicationGroupOffset> getOffsets(ReplicationQueueId queueId)
    throws ReplicationException {
    try (Table table = conn.getTable(tableName)) {
      return getOffsets0(table, queueId);
    } catch (IOException e) {
      throw new ReplicationException("failed to getOffsets, queueId=" + queueId, e);
    }
  }

  private void listAllQueueIds(Table table, Scan scan, List<ReplicationQueueId> queueIds)
    throws IOException {
    try (ResultScanner scanner = table.getScanner(scan)) {
      for (;;) {
        Result result = scanner.next();
        if (result == null) {
          break;
        }
        ReplicationQueueId queueId = ReplicationQueueId.parse(Bytes.toString(result.getRow()));
        queueIds.add(queueId);
      }
    }
  }

  private void listAllQueueIds(Table table, String peerId, ServerName serverName,
    List<ReplicationQueueId> queueIds) throws IOException {
    listAllQueueIds(table,
      new Scan().setStartStopRowForPrefixScan(ReplicationQueueId.getScanPrefix(serverName, peerId))
        .addFamily(QUEUE_FAMILY).setFilter(new KeyOnlyFilter()),
      queueIds);
  }

  @Override
  public List<ReplicationQueueId> listAllQueueIds(String peerId) throws ReplicationException {
    Scan scan = new Scan().setStartStopRowForPrefixScan(ReplicationQueueId.getScanPrefix(peerId))
      .addFamily(QUEUE_FAMILY).setFilter(new KeyOnlyFilter());
    List<ReplicationQueueId> queueIds = new ArrayList<>();
    try (Table table = conn.getTable(tableName)) {
      listAllQueueIds(table, scan, queueIds);
    } catch (IOException e) {
      throw new ReplicationException("failed to listAllQueueIds, peerId=" + peerId, e);
    }
    return queueIds;
  }

  @Override
  public List<ReplicationQueueId> listAllQueueIds(ServerName serverName)
    throws ReplicationException {
    List<ReplicationQueueId> queueIds = new ArrayList<>();
    try (Table table = conn.getTable(tableName)) {
      KeyOnlyFilter keyOnlyFilter = new KeyOnlyFilter();
      String previousPeerId = null;
      for (;;) {
        // first, get the next peerId
        Scan peerScan =
          new Scan().addFamily(QUEUE_FAMILY).setOneRowLimit().setFilter(keyOnlyFilter);
        if (previousPeerId != null) {
          peerScan.withStartRow(ReplicationQueueId.getScanStartRowForNextPeerId(previousPeerId));
        }
        String peerId;
        try (ResultScanner scanner = table.getScanner(peerScan)) {
          Result result = scanner.next();
          if (result == null) {
            // no more peers, break
            break;
          }
          peerId = ReplicationQueueId.getPeerId(Bytes.toString(result.getRow()));
        }
        listAllQueueIds(table, peerId, serverName, queueIds);
        previousPeerId = peerId;
      }
    } catch (IOException e) {
      throw new ReplicationException("failed to listAllQueueIds, serverName=" + serverName, e);
    }
    return queueIds;
  }

  @Override
  public List<ReplicationQueueId> listAllQueueIds(String peerId, ServerName serverName)
    throws ReplicationException {
    List<ReplicationQueueId> queueIds = new ArrayList<>();
    try (Table table = conn.getTable(tableName)) {
      listAllQueueIds(table, peerId, serverName, queueIds);
    } catch (IOException e) {
      throw new ReplicationException(
        "failed to listAllQueueIds, peerId=" + peerId + ", serverName=" + serverName, e);
    }
    return queueIds;
  }

  @Override
  public List<ReplicationQueueData> listAllQueues() throws ReplicationException {
    List<ReplicationQueueData> queues = new ArrayList<>();
    Scan scan = new Scan().addFamily(QUEUE_FAMILY).setReadType(ReadType.STREAM);
    try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) {
      for (;;) {
        Result result = scanner.next();
        if (result == null) {
          break;
        }
        ReplicationQueueId queueId = ReplicationQueueId.parse(Bytes.toString(result.getRow()));
        ReplicationQueueData queueData = new ReplicationQueueData(queueId, parseOffsets(result));
        queues.add(queueData);
      }
    } catch (IOException e) {
      throw new ReplicationException("failed to listAllQueues", e);
    }
    return queues;
  }

  @Override
  public List<ServerName> listAllReplicators() throws ReplicationException {
    Set<ServerName> replicators = new HashSet<>();
    Scan scan = new Scan().addFamily(QUEUE_FAMILY).setFilter(new KeyOnlyFilter())
      .setReadType(ReadType.STREAM);
    try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) {
      for (;;) {
        Result result = scanner.next();
        if (result == null) {
          break;
        }
        ReplicationQueueId queueId = ReplicationQueueId.parse(Bytes.toString(result.getRow()));
        replicators.add(queueId.getServerName());
      }
    } catch (IOException e) {
      throw new ReplicationException("failed to listAllReplicators", e);
    }
    return new ArrayList<>(replicators);
  }

  @Override
  public Map<String, ReplicationGroupOffset> claimQueue(ReplicationQueueId queueId,
    ServerName targetServerName) throws ReplicationException {
    ReplicationQueueId newQueueId = queueId.claim(targetServerName);
    byte[] coprocessorRow = ReplicationQueueId.getScanPrefix(queueId.getPeerId());
    AsyncTable<?> asyncTable = conn.toAsyncConnection().getTable(tableName);
    try (Table table = conn.getTable(tableName)) {
      for (;;) {
        Map<String, ReplicationGroupOffset> offsets = getOffsets0(table, queueId);
        if (offsets.isEmpty()) {
          return Collections.emptyMap();
        }
        Map.Entry<String, ReplicationGroupOffset> entry = offsets.entrySet().iterator().next();
        ClientProtos.Condition condition = ProtobufUtil.toCondition(
          Bytes.toBytes(queueId.toString()), QUEUE_FAMILY, Bytes.toBytes(entry.getKey()),
          CompareOperator.EQUAL, Bytes.toBytes(entry.getValue().toString()), null);
        Delete delete = new Delete(Bytes.toBytes(queueId.toString())).addFamily(QUEUE_FAMILY);
        Put put = new Put(Bytes.toBytes(newQueueId.toString()));
        offsets.forEach((walGroup, offset) -> put.addColumn(QUEUE_FAMILY, Bytes.toBytes(walGroup),
          Bytes.toBytes(offset.toString())));
        MultiRowMutationProtos.MutateRowsRequest request =
          MultiRowMutationProtos.MutateRowsRequest.newBuilder().addCondition(condition)
            .addMutationRequest(ProtobufUtil.toMutation(MutationType.DELETE, delete))
            .addMutationRequest(ProtobufUtil.toMutation(MutationType.PUT, put)).build();
        MultiRowMutationProtos.MutateRowsResponse resp =
          FutureUtils.get(asyncTable.<MultiRowMutationProtos.MultiRowMutationService.Interface,
            MultiRowMutationProtos.MutateRowsResponse> coprocessorService(
              MultiRowMutationProtos.MultiRowMutationService::newStub,
              (stub, controller, done) -> stub.mutateRows(controller, request, done),
              coprocessorRow));
        if (resp.getProcessed()) {
          return offsets;
        }
        // if the multi is not processed, which usually the queue has already been claimed by
        // others, for safety, let's try claiming again, usually the next get operation above will
        // return an empty map and we will quit the loop.
      }
    } catch (IOException e) {
      throw new ReplicationException(
        "failed to claimQueue, queueId=" + queueId + ", targetServerName=" + targetServerName, e);
    }
  }

  @Override
  public void removeQueue(ReplicationQueueId queueId) throws ReplicationException {
    try (Table table = conn.getTable(tableName)) {
      table.delete(new Delete(Bytes.toBytes(queueId.toString())).addFamily(QUEUE_FAMILY));
    } catch (IOException e) {
      throw new ReplicationException("failed to removeQueue, queueId=" + queueId, e);
    }
  }

  @Override
  public void removeAllQueues(String peerId) throws ReplicationException {
    Scan scan = new Scan().setStartStopRowForPrefixScan(ReplicationQueueId.getScanPrefix(peerId))
      .addFamily(QUEUE_FAMILY).setFilter(new KeyOnlyFilter());
    try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) {
      for (;;) {
        Result result = scanner.next();
        if (result == null) {
          break;
        }
        table.delete(new Delete(result.getRow()));
      }
    } catch (IOException e) {
      throw new ReplicationException("failed to listAllQueueIds, peerId=" + peerId, e);
    }
  }

  @Override
  public long getLastSequenceId(String encodedRegionName, String peerId)
    throws ReplicationException {
    byte[] qual = Bytes.toBytes(encodedRegionName);
    try (Table table = conn.getTable(tableName)) {
      Result result =
        table.get(new Get(Bytes.toBytes(peerId)).addColumn(LAST_SEQUENCE_ID_FAMILY, qual));
      byte[] lastSeqId = result.getValue(LAST_SEQUENCE_ID_FAMILY, qual);
      return lastSeqId != null ? Bytes.toLong(lastSeqId) : HConstants.NO_SEQNUM;
    } catch (IOException e) {
      throw new ReplicationException("failed to getLastSequenceId, encodedRegionName="
        + encodedRegionName + ", peerId=" + peerId, e);
    }
  }

  @Override
  public void setLastSequenceIds(String peerId, Map<String, Long> lastSeqIds)
    throws ReplicationException {
    // No need CAS and retry here, because it'll call setLastSequenceIds() for disabled peers
    // only, so no conflict happen.
    Put put = new Put(Bytes.toBytes(peerId));
    lastSeqIds.forEach((encodedRegionName, lastSeqId) -> put.addColumn(LAST_SEQUENCE_ID_FAMILY,
      Bytes.toBytes(encodedRegionName), Bytes.toBytes(lastSeqId)));
    try (Table table = conn.getTable(tableName)) {
      table.put(put);
    } catch (IOException e) {
      throw new ReplicationException(
        "failed to setLastSequenceIds, peerId=" + peerId + ", lastSeqIds=" + lastSeqIds, e);
    }
  }

  @Override
  public void removeLastSequenceIds(String peerId) throws ReplicationException {
    Delete delete = new Delete(Bytes.toBytes(peerId)).addFamily(LAST_SEQUENCE_ID_FAMILY);
    try (Table table = conn.getTable(tableName)) {
      table.delete(delete);
    } catch (IOException e) {
      throw new ReplicationException("failed to removeLastSequenceIds, peerId=" + peerId, e);
    }
  }

  @Override
  public void removeLastSequenceIds(String peerId, List<String> encodedRegionNames)
    throws ReplicationException {
    Delete delete = new Delete(Bytes.toBytes(peerId));
    encodedRegionNames.forEach(n -> delete.addColumns(LAST_SEQUENCE_ID_FAMILY, Bytes.toBytes(n)));
    try (Table table = conn.getTable(tableName)) {
      table.delete(delete);
    } catch (IOException e) {
      throw new ReplicationException("failed to removeLastSequenceIds, peerId=" + peerId
        + ", encodedRegionNames=" + encodedRegionNames, e);
    }
  }

  @Override
  public void removePeerFromHFileRefs(String peerId) throws ReplicationException {
    try (Table table = conn.getTable(tableName)) {
      table.delete(new Delete(Bytes.toBytes(peerId)).addFamily(HFILE_REF_FAMILY));
    } catch (IOException e) {
      throw new ReplicationException("failed to removePeerFromHFileRefs, peerId=" + peerId, e);
    }
  }

  @Override
  public void addHFileRefs(String peerId, List<Pair<Path, Path>> pairs)
    throws ReplicationException {
    Put put = new Put(Bytes.toBytes(peerId));
    pairs.forEach(p -> put.addColumn(HFILE_REF_FAMILY, Bytes.toBytes(p.getSecond().getName()),
      HConstants.EMPTY_BYTE_ARRAY));
    try (Table table = conn.getTable(tableName)) {
      table.put(put);
    } catch (IOException e) {
      throw new ReplicationException(
        "failed to addHFileRefs, peerId=" + peerId + ", pairs=" + pairs, e);
    }
  }

  @Override
  public void removeHFileRefs(String peerId, List<String> files) throws ReplicationException {
    Delete delete = new Delete(Bytes.toBytes(peerId));
    files.forEach(f -> delete.addColumns(HFILE_REF_FAMILY, Bytes.toBytes(f)));
    try (Table table = conn.getTable(tableName)) {
      table.delete(delete);
    } catch (IOException e) {
      throw new ReplicationException(
        "failed to removeHFileRefs, peerId=" + peerId + ", files=" + files, e);
    }
  }

  @Override
  public List<String> getAllPeersFromHFileRefsQueue() throws ReplicationException {
    List<String> peerIds = new ArrayList<>();
    Scan scan = new Scan().addFamily(HFILE_REF_FAMILY).setReadType(ReadType.STREAM)
      .setFilter(new KeyOnlyFilter());
    try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) {
      for (;;) {
        Result result = scanner.next();
        if (result == null) {
          break;
        }
        peerIds.add(Bytes.toString(result.getRow()));
      }
    } catch (IOException e) {
      throw new ReplicationException("failed to getAllPeersFromHFileRefsQueue", e);
    }
    return peerIds;
  }

  private <T extends Collection<String>> T scanHFiles(Scan scan, Supplier<T> creator)
    throws IOException {
    T files = creator.get();
    try (Table table = conn.getTable(tableName); ResultScanner scanner = table.getScanner(scan)) {
      for (;;) {
        Result result = scanner.next();
        if (result == null) {
          break;
        }
        CellScanner cellScanner = result.cellScanner();
        while (cellScanner.advance()) {
          Cell cell = cellScanner.current();
          files.add(Bytes.toString(CellUtil.cloneQualifier(cell)));
        }
      }
    }
    return files;
  }

  @Override
  public List<String> getReplicableHFiles(String peerId) throws ReplicationException {
    // use scan to avoid getting a too large row one time, which may cause a very huge memory usage.
    Scan scan = new Scan().addFamily(HFILE_REF_FAMILY)
      .setStartStopRowForPrefixScan(Bytes.toBytes(peerId)).setAllowPartialResults(true);
    try {
      return scanHFiles(scan, ArrayList::new);
    } catch (IOException e) {
      throw new ReplicationException("failed to getReplicableHFiles, peerId=" + peerId, e);
    }
  }

  @Override
  public Set<String> getAllHFileRefs() throws ReplicationException {
    Scan scan = new Scan().addFamily(HFILE_REF_FAMILY).setReadType(ReadType.STREAM)
      .setAllowPartialResults(true);
    try {
      return scanHFiles(scan, HashSet::new);
    } catch (IOException e) {
      throw new ReplicationException("failed to getAllHFileRefs", e);
    }
  }
}
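To illustrate the intended claim flow around this storage (server names are made up and the surrounding procedure code is not part of this diff): when a region server dies, each of its queues is atomically moved to a live server via claimQueue, and the returned offsets tell the new owner where to resume.

  ReplicationQueueStorage storage = ...; // a TableReplicationQueueStorage in practice
  ServerName crashed = ServerName.valueOf("rs2.example.org,16020,1600000000000");
  ServerName target = ServerName.valueOf("rs1.example.org,16020,1600000000000");
  for (ReplicationQueueId queueId : storage.listAllQueueIds(crashed)) {
    Map<String, ReplicationGroupOffset> offsets = storage.claimQueue(queueId, target);
    if (!offsets.isEmpty()) {
      // start a recovered replication source on 'target' from these offsets
    }
  }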
@ -1,689 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.replication;
|
||||
|
||||
import static java.util.stream.Collectors.toList;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Map.Entry;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.exceptions.DeserializationException;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKUtil.ZKUtilOp;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.apache.zookeeper.KeeperException.BadVersionException;
|
||||
import org.apache.zookeeper.KeeperException.NoNodeException;
|
||||
import org.apache.zookeeper.KeeperException.NodeExistsException;
|
||||
import org.apache.zookeeper.KeeperException.NotEmptyException;
|
||||
import org.apache.zookeeper.data.Stat;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.org.apache.commons.collections4.CollectionUtils;
|
||||
|
||||
/**
|
||||
* ZK based replication queue storage.
|
||||
* <p>
|
||||
* The base znode for each regionserver is the regionserver name. For example:
|
||||
*
|
||||
* <pre>
|
||||
* /hbase/replication/rs/hostname.example.org,6020,1234
|
||||
* </pre>
|
||||
*
|
||||
* Within this znode, the region server maintains a set of WAL replication queues. These queues are
|
||||
* represented by child znodes named using there give queue id. For example:
|
||||
*
|
||||
* <pre>
|
||||
* /hbase/replication/rs/hostname.example.org,6020,1234/1
|
||||
* /hbase/replication/rs/hostname.example.org,6020,1234/2
|
||||
* </pre>
|
||||
*
|
||||
* Each queue has one child znode for every WAL that still needs to be replicated. The value of
|
||||
* these WAL child znodes is the latest position that has been replicated. This position is updated
|
||||
* every time a WAL entry is replicated. For example:
|
||||
*
|
||||
* <pre>
|
||||
* /hbase/replication/rs/hostname.example.org,6020,1234/1/23522342.23422 [VALUE: 254]
|
||||
* </pre>
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
class ZKReplicationQueueStorage extends ZKReplicationStorageBase
|
||||
implements ReplicationQueueStorage {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ZKReplicationQueueStorage.class);
|
||||
|
||||
public static final String ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_KEY =
|
||||
"zookeeper.znode.replication.hfile.refs";
|
||||
public static final String ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_DEFAULT = "hfile-refs";
|
||||
|
||||
public static final String ZOOKEEPER_ZNODE_REPLICATION_REGIONS_KEY =
|
||||
"zookeeper.znode.replication.regions";
|
||||
public static final String ZOOKEEPER_ZNODE_REPLICATION_REGIONS_DEFAULT = "regions";
|
||||
|
||||
/**
|
||||
* The name of the znode that contains all replication queues
|
||||
*/
|
||||
private final String queuesZNode;
|
||||
|
||||
/**
|
||||
* The name of the znode that contains queues of hfile references to be replicated
|
||||
*/
|
||||
private final String hfileRefsZNode;
|
||||
|
||||
final String regionsZNode;
|
||||
|
||||
public ZKReplicationQueueStorage(ZKWatcher zookeeper, Configuration conf) {
|
||||
super(zookeeper, conf);
|
||||
|
||||
String queuesZNodeName = conf.get("zookeeper.znode.replication.rs", "rs");
|
||||
String hfileRefsZNodeName = conf.get(ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_KEY,
|
||||
ZOOKEEPER_ZNODE_REPLICATION_HFILE_REFS_DEFAULT);
|
||||
this.queuesZNode = ZNodePaths.joinZNode(replicationZNode, queuesZNodeName);
|
||||
this.hfileRefsZNode = ZNodePaths.joinZNode(replicationZNode, hfileRefsZNodeName);
|
||||
this.regionsZNode = ZNodePaths.joinZNode(replicationZNode, conf
|
||||
.get(ZOOKEEPER_ZNODE_REPLICATION_REGIONS_KEY, ZOOKEEPER_ZNODE_REPLICATION_REGIONS_DEFAULT));
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getRsNode(ServerName serverName) {
|
||||
return ZNodePaths.joinZNode(queuesZNode, serverName.getServerName());
|
||||
}
|
||||
|
||||
private String getQueueNode(ServerName serverName, String queueId) {
|
||||
return ZNodePaths.joinZNode(getRsNode(serverName), queueId);
|
||||
}
|
||||
|
||||
private String getFileNode(String queueNode, String fileName) {
|
||||
return ZNodePaths.joinZNode(queueNode, fileName);
|
||||
}
|
||||
|
||||
private String getFileNode(ServerName serverName, String queueId, String fileName) {
|
||||
return getFileNode(getQueueNode(serverName, queueId), fileName);
|
||||
}
|
||||
|
||||
/**
|
||||
* <p>
|
||||
* Put all regions under /hbase/replication/regions znode will lead to too many children because
|
||||
* of the huge number of regions in real production environment. So here we will distribute the
|
||||
* znodes to multiple directories.
|
||||
* </p>
|
||||
* <p>
|
||||
* So the final znode path will be format like this:
|
||||
*
|
||||
* <pre>
|
||||
* /hbase/replication/regions/dd/04/e76a6966d4ffa908ed0586764767-100
|
||||
* </pre>
|
||||
*
|
||||
* Here the full encoded region name is dd04e76a6966d4ffa908ed0586764767, and we use the first two
|
||||
* characters 'dd' as the first level directory name, and use the next two characters '04' as the
|
||||
* second level directory name, and the rest part as the prefix of the znode, and the suffix '100'
|
||||
* is the peer id.
|
||||
* </p>
|
||||
* @param encodedRegionName the encoded region name.
|
||||
* @param peerId peer id for replication.
|
||||
* @return ZNode path to persist the max sequence id that we've pushed for the given region and
|
||||
* peer.
|
||||
*/
|
||||
String getSerialReplicationRegionPeerNode(String encodedRegionName, String peerId) {
|
||||
if (encodedRegionName == null || encodedRegionName.length() != RegionInfo.MD5_HEX_LENGTH) {
|
||||
throw new IllegalArgumentException(
|
||||
"Invalid encoded region name: " + encodedRegionName + ", length should be 32.");
|
||||
}
|
||||
return new StringBuilder(regionsZNode).append(ZNodePaths.ZNODE_PATH_SEPARATOR)
|
||||
.append(encodedRegionName, 0, 2).append(ZNodePaths.ZNODE_PATH_SEPARATOR)
|
||||
.append(encodedRegionName, 2, 4).append(ZNodePaths.ZNODE_PATH_SEPARATOR)
|
||||
.append(encodedRegionName, 4, encodedRegionName.length()).append("-").append(peerId)
|
||||
.toString();
|
||||
}
|
||||
|
||||
@Override
|
||||
public void removeQueue(ServerName serverName, String queueId) throws ReplicationException {
|
||||
try {
|
||||
ZKUtil.deleteNodeRecursively(zookeeper, getQueueNode(serverName, queueId));
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException(
|
||||
"Failed to delete queue (serverName=" + serverName + ", queueId=" + queueId + ")", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addWAL(ServerName serverName, String queueId, String fileName)
|
||||
throws ReplicationException {
|
||||
try {
|
||||
ZKUtil.createWithParents(zookeeper, getFileNode(serverName, queueId, fileName));
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to add wal to queue (serverName=" + serverName
|
||||
+ ", queueId=" + queueId + ", fileName=" + fileName + ")", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void removeWAL(ServerName serverName, String queueId, String fileName)
|
||||
throws ReplicationException {
|
||||
String fileNode = getFileNode(serverName, queueId, fileName);
|
||||
try {
|
||||
ZKUtil.deleteNode(zookeeper, fileNode);
|
||||
} catch (NoNodeException e) {
|
||||
LOG.warn("{} already deleted when removing log", fileNode);
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to remove wal from queue (serverName=" + serverName
|
||||
+ ", queueId=" + queueId + ", fileName=" + fileName + ")", e);
|
||||
}
|
||||
}
|
||||
|
||||
private void addLastSeqIdsToOps(String queueId, Map<String, Long> lastSeqIds,
|
||||
List<ZKUtilOp> listOfOps) throws KeeperException, ReplicationException {
|
||||
String peerId = new ReplicationQueueInfo(queueId).getPeerId();
|
||||
for (Entry<String, Long> lastSeqEntry : lastSeqIds.entrySet()) {
|
||||
String path = getSerialReplicationRegionPeerNode(lastSeqEntry.getKey(), peerId);
|
||||
Pair<Long, Integer> p = getLastSequenceIdWithVersion(lastSeqEntry.getKey(), peerId);
|
||||
byte[] data = ZKUtil.positionToByteArray(lastSeqEntry.getValue());
|
||||
if (p.getSecond() < 0) { // ZNode does not exist.
|
||||
ZKUtil.createWithParents(zookeeper,
|
||||
path.substring(0, path.lastIndexOf(ZNodePaths.ZNODE_PATH_SEPARATOR)));
|
||||
listOfOps.add(ZKUtilOp.createAndFailSilent(path, data));
|
||||
continue;
|
||||
}
|
||||
// Perform a CAS on the specific version v0 (HBASE-20138)
|
||||
int v0 = p.getSecond();
|
||||
long lastPushedSeqId = p.getFirst();
|
||||
if (lastSeqEntry.getValue() <= lastPushedSeqId) {
|
||||
continue;
|
||||
}
|
||||
listOfOps.add(ZKUtilOp.setData(path, data, v0));
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setWALPosition(ServerName serverName, String queueId, String fileName, long position,
|
||||
Map<String, Long> lastSeqIds) throws ReplicationException {
|
||||
try {
|
||||
for (int retry = 0;; retry++) {
|
||||
List<ZKUtilOp> listOfOps = new ArrayList<>();
|
||||
if (position > 0) {
|
||||
listOfOps.add(ZKUtilOp.setData(getFileNode(serverName, queueId, fileName),
|
||||
ZKUtil.positionToByteArray(position)));
|
||||
}
|
||||
// Persist the max sequence id(s) of regions for serial replication atomically.
|
||||
addLastSeqIdsToOps(queueId, lastSeqIds, listOfOps);
|
||||
if (listOfOps.isEmpty()) {
|
||||
return;
|
||||
}
|
||||
try {
|
||||
ZKUtil.multiOrSequential(zookeeper, listOfOps, false);
|
||||
return;
|
||||
} catch (KeeperException.BadVersionException | KeeperException.NodeExistsException e) {
|
||||
LOG.warn(
|
||||
"Bad version(or node exist) when persist the last pushed sequence id to zookeeper "
|
||||
+ "storage, Retry = " + retry + ", serverName=" + serverName + ", queueId=" + queueId
|
||||
+ ", fileName=" + fileName);
|
||||
}
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to set log position (serverName=" + serverName
|
||||
+ ", queueId=" + queueId + ", fileName=" + fileName + ", position=" + position + ")", e);
|
||||
}
|
||||
}
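The retry loop above is an optimistic compare-and-swap on the znode's data version: read the current version, write only if that version is still current, and go around again on BadVersionException. A minimal standalone sketch of the pattern, assuming an in-memory versioned cell as a stand-in for the znode (not a ZooKeeper or HBase API):

// Hypothetical in-memory stand-in for a versioned znode; not an HBase or ZooKeeper API.
final class VersionedCell {
  private long value = -1;
  private long version = 0;

  synchronized long[] read() { return new long[] { value, version }; }

  synchronized boolean writeIfVersion(long newValue, long expectedVersion) {
    if (version != expectedVersion) {
      return false; // mirrors KeeperException.BadVersionException
    }
    value = newValue;
    version++;
    return true;
  }
}

public final class CasRetrySketch {
  // Only advance the stored value, never move it backwards; retry on concurrent updates.
  static void advance(VersionedCell cell, long candidate) {
    for (;;) {
      long[] cur = cell.read();
      if (candidate <= cur[0]) {
        return; // a newer value is already persisted, keep it
      }
      if (cell.writeIfVersion(candidate, cur[1])) {
        return; // conditional write succeeded
      }
      // version changed under us: re-read and try again
    }
  }

  public static void main(String[] args) {
    VersionedCell cell = new VersionedCell();
    advance(cell, 900);
    advance(cell, 899); // ignored, smaller than the last pushed value
    System.out.println(java.util.Arrays.toString(cell.read())); // [900, 1]
  }
}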
|
||||
|
||||
/**
|
||||
* Return the {lastPushedSequenceId, ZNodeDataVersion} pair. If ZNodeDataVersion is -1, it means
|
||||
* that the ZNode does not exist.
|
||||
*/
|
||||
protected Pair<Long, Integer> getLastSequenceIdWithVersion(String encodedRegionName,
|
||||
String peerId) throws KeeperException {
|
||||
Stat stat = new Stat();
|
||||
String path = getSerialReplicationRegionPeerNode(encodedRegionName, peerId);
|
||||
byte[] data = ZKUtil.getDataNoWatch(zookeeper, path, stat);
|
||||
if (data == null) {
|
||||
// ZNode does not exist, so just return version -1 to indicate that no node exists.
|
||||
return Pair.newPair(HConstants.NO_SEQNUM, -1);
|
||||
}
|
||||
try {
|
||||
return Pair.newPair(ZKUtil.parseWALPositionFrom(data), stat.getVersion());
|
||||
} catch (DeserializationException de) {
|
||||
LOG.warn("Failed to parse log position (region=" + encodedRegionName + ", peerId=" + peerId
|
||||
+ "), data=" + Bytes.toStringBinary(data));
|
||||
}
|
||||
return Pair.newPair(HConstants.NO_SEQNUM, stat.getVersion());
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getLastSequenceId(String encodedRegionName, String peerId)
|
||||
throws ReplicationException {
|
||||
try {
|
||||
return getLastSequenceIdWithVersion(encodedRegionName, peerId).getFirst();
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to get last pushed sequence id (encodedRegionName="
|
||||
+ encodedRegionName + ", peerId=" + peerId + ")", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void setLastSequenceIds(String peerId, Map<String, Long> lastSeqIds)
|
||||
throws ReplicationException {
|
||||
try {
|
||||
// No need for CAS and retry here, because setLastSequenceIds() is only called for disabled
// peers, so no conflict can happen.
|
||||
List<ZKUtilOp> listOfOps = new ArrayList<>();
|
||||
for (Entry<String, Long> lastSeqEntry : lastSeqIds.entrySet()) {
|
||||
String path = getSerialReplicationRegionPeerNode(lastSeqEntry.getKey(), peerId);
|
||||
ZKUtil.createWithParents(zookeeper, path);
|
||||
listOfOps.add(ZKUtilOp.setData(path, ZKUtil.positionToByteArray(lastSeqEntry.getValue())));
|
||||
}
|
||||
if (!listOfOps.isEmpty()) {
|
||||
ZKUtil.multiOrSequential(zookeeper, listOfOps, true);
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to set last sequence ids, peerId=" + peerId
|
||||
+ ", size of lastSeqIds=" + lastSeqIds.size(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void removeLastSequenceIds(String peerId) throws ReplicationException {
|
||||
String suffix = "-" + peerId;
|
||||
try {
|
||||
StringBuilder sb = new StringBuilder(regionsZNode);
|
||||
int regionsZNodeLength = regionsZNode.length();
|
||||
int levelOneLength = regionsZNodeLength + 3;
|
||||
int levelTwoLength = levelOneLength + 3;
|
||||
List<String> levelOneDirs = ZKUtil.listChildrenNoWatch(zookeeper, regionsZNode);
|
||||
// it is possible that levelOneDirs is null if we haven't written any last pushed sequence ids
|
||||
// yet, so we need an extra check here.
|
||||
if (CollectionUtils.isEmpty(levelOneDirs)) {
|
||||
return;
|
||||
}
|
||||
for (String levelOne : levelOneDirs) {
|
||||
sb.append(ZNodePaths.ZNODE_PATH_SEPARATOR).append(levelOne);
|
||||
for (String levelTwo : ZKUtil.listChildrenNoWatch(zookeeper, sb.toString())) {
|
||||
sb.append(ZNodePaths.ZNODE_PATH_SEPARATOR).append(levelTwo);
|
||||
for (String znode : ZKUtil.listChildrenNoWatch(zookeeper, sb.toString())) {
|
||||
if (znode.endsWith(suffix)) {
|
||||
sb.append(ZNodePaths.ZNODE_PATH_SEPARATOR).append(znode);
|
||||
ZKUtil.deleteNode(zookeeper, sb.toString());
|
||||
sb.setLength(levelTwoLength);
|
||||
}
|
||||
}
|
||||
sb.setLength(levelOneLength);
|
||||
}
|
||||
sb.setLength(regionsZNodeLength);
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to remove all last sequence ids, peerId=" + peerId, e);
|
||||
}
|
||||
}
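The walk above reuses a single StringBuilder and rewinds it with setLength after each level instead of allocating a fresh path string per znode. A standalone sketch of the same traversal over an in-memory two-level map (hypothetical names and data, no ZooKeeper involved):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public final class TwoLevelWalkSketch {
  // Delete every leaf ending with "-" + peerId, reusing one StringBuilder and rewinding it
  // with setLength(...) after each level instead of building fresh path strings.
  static List<String> removeForPeer(Map<String, Map<String, List<String>>> tree, String peerId) {
    String suffix = "-" + peerId;
    List<String> deleted = new ArrayList<>();
    StringBuilder sb = new StringBuilder("/regions");
    int rootLen = sb.length();
    for (Map.Entry<String, Map<String, List<String>>> l1 : tree.entrySet()) {
      sb.append('/').append(l1.getKey());
      int l1Len = sb.length();
      for (Map.Entry<String, List<String>> l2 : l1.getValue().entrySet()) {
        sb.append('/').append(l2.getKey());
        int l2Len = sb.length();
        for (String leaf : l2.getValue()) {
          if (leaf.endsWith(suffix)) {
            sb.append('/').append(leaf);
            deleted.add(sb.toString()); // stand-in for deleting the znode
            sb.setLength(l2Len);        // rewind to the second-level path
          }
        }
        sb.setLength(l1Len);            // rewind to the first-level path
      }
      sb.setLength(rootLen);            // rewind to the root
    }
    return deleted;
  }

  public static void main(String[] args) {
    Map<String, Map<String, List<String>>> tree =
      Map.of("dd", Map.of("04", List.of("e76a-1", "e76a-2")));
    System.out.println(removeForPeer(tree, "2")); // [/regions/dd/04/e76a-2]
  }
}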
|
||||
|
||||
@Override
|
||||
public void removeLastSequenceIds(String peerId, List<String> encodedRegionNames)
|
||||
throws ReplicationException {
|
||||
try {
|
||||
List<ZKUtilOp> listOfOps =
|
||||
encodedRegionNames.stream().map(n -> getSerialReplicationRegionPeerNode(n, peerId))
|
||||
.map(ZKUtilOp::deleteNodeFailSilent).collect(Collectors.toList());
|
||||
ZKUtil.multiOrSequential(zookeeper, listOfOps, true);
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to remove last sequence ids, peerId=" + peerId
|
||||
+ ", encodedRegionNames.size=" + encodedRegionNames.size(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getWALPosition(ServerName serverName, String queueId, String fileName)
|
||||
throws ReplicationException {
|
||||
byte[] bytes;
|
||||
try {
|
||||
bytes = ZKUtil.getData(zookeeper, getFileNode(serverName, queueId, fileName));
|
||||
} catch (KeeperException | InterruptedException e) {
|
||||
throw new ReplicationException("Failed to get log position (serverName=" + serverName
|
||||
+ ", queueId=" + queueId + ", fileName=" + fileName + ")", e);
|
||||
}
|
||||
try {
|
||||
return ZKUtil.parseWALPositionFrom(bytes);
|
||||
} catch (DeserializationException de) {
|
||||
LOG.warn("Failed parse log position (serverName={}, queueId={}, fileName={})", serverName,
|
||||
queueId, fileName);
|
||||
}
|
||||
// if we can not parse the position, start at the beginning of the wal file again
|
||||
return 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* This implementation must update the cversion of the root {@link #queuesZNode}. The optimistic
* lock of the {@link #getAllWALs()} method is based on the cversion of the root
* {@link #queuesZNode}.
* @see #getAllWALs() to show the usage of the cversion of the root {@link #queuesZNode}.
|
||||
*/
|
||||
@Override
|
||||
public Pair<String, SortedSet<String>> claimQueue(ServerName sourceServerName, String queueId,
|
||||
ServerName destServerName) throws ReplicationException {
|
||||
LOG.info("Atomically moving {}/{}'s WALs to {}", sourceServerName, queueId, destServerName);
|
||||
try {
|
||||
ZKUtil.createWithParents(zookeeper, getRsNode(destServerName));
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Claim queue queueId=" + queueId + " from " + sourceServerName
|
||||
+ " to " + destServerName + " failed when creating the node for " + destServerName, e);
|
||||
}
|
||||
String newQueueId = queueId + "-" + sourceServerName;
|
||||
try {
|
||||
String oldQueueNode = getQueueNode(sourceServerName, queueId);
|
||||
List<String> wals = ZKUtil.listChildrenNoWatch(zookeeper, oldQueueNode);
|
||||
if (CollectionUtils.isEmpty(wals)) {
|
||||
ZKUtil.deleteNodeFailSilent(zookeeper, oldQueueNode);
|
||||
LOG.info("Removed empty {}/{}", sourceServerName, queueId);
|
||||
return new Pair<>(newQueueId, Collections.emptySortedSet());
|
||||
}
|
||||
String newQueueNode = getQueueNode(destServerName, newQueueId);
|
||||
List<ZKUtilOp> listOfOps = new ArrayList<>();
|
||||
SortedSet<String> logQueue = new TreeSet<>();
|
||||
// create the new queue znode
|
||||
listOfOps.add(ZKUtilOp.createAndFailSilent(newQueueNode, HConstants.EMPTY_BYTE_ARRAY));
|
||||
// get the offset of the logs and set it to new znodes
|
||||
for (String wal : wals) {
|
||||
String oldWalNode = getFileNode(oldQueueNode, wal);
|
||||
byte[] logOffset = ZKUtil.getData(this.zookeeper, oldWalNode);
|
||||
LOG.debug("Creating {} with data {}", wal, Bytes.toStringBinary(logOffset));
|
||||
String newWalNode = getFileNode(newQueueNode, wal);
|
||||
listOfOps.add(ZKUtilOp.createAndFailSilent(newWalNode, logOffset));
|
||||
listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldWalNode));
|
||||
logQueue.add(wal);
|
||||
}
|
||||
// add a delete op for the old queue node
|
||||
listOfOps.add(ZKUtilOp.deleteNodeFailSilent(oldQueueNode));
|
||||
// Append the new queue id to prevent lock contention in the zookeeper server.
String claimLockZNode = ZNodePaths.joinZNode(queuesZNode, "cversion_" + newQueueId);
// A trick to update the cversion of the root queuesZNode.
|
||||
// The optimistic lock of the getAllWALs() method is based on the cversion of root queuesZNode
|
||||
listOfOps.add(ZKUtilOp.createAndFailSilent(claimLockZNode, HConstants.EMPTY_BYTE_ARRAY));
|
||||
listOfOps.add(ZKUtilOp.deleteNodeFailSilent(claimLockZNode));
|
||||
|
||||
LOG.trace("The multi list size is {}", listOfOps.size());
|
||||
ZKUtil.multiOrSequential(zookeeper, listOfOps, false);
|
||||
|
||||
LOG.info("Atomically moved {}/{}'s WALs to {}", sourceServerName, queueId, destServerName);
|
||||
return new Pair<>(newQueueId, logQueue);
|
||||
} catch (NoNodeException | NodeExistsException | NotEmptyException | BadVersionException e) {
|
||||
// Multi call failed; it looks like some other regionserver took away the logs.
// These exceptions mean that zk tells us the request cannot be executed, so return an empty
// queue to tell the upper layer that we claimed nothing. Other types of exception should be
// thrown out to notify the upper layer.
|
||||
LOG.info("Claim queue queueId={} from {} to {} failed with {}, someone else took the log?",
|
||||
queueId, sourceServerName, destServerName, e.toString());
|
||||
return new Pair<>(newQueueId, Collections.emptySortedSet());
|
||||
} catch (KeeperException | InterruptedException e) {
|
||||
throw new ReplicationException("Claim queue queueId=" + queueId + " from " + sourceServerName
|
||||
+ " to " + destServerName + " failed", e);
|
||||
}
|
||||
}
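The queue move is expressed as a single multi: create each wal under the new queue with its copied offset, delete it from the old queue, drop the old queue node, and finally create and delete a throwaway child so the parent's cversion is bumped even though the children are merely relocated. A rough standalone sketch of building such an op list (the Op record and paths are made up; this is not the ZKUtilOp API):

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public final class AtomicMoveSketch {
  // A tiny stand-in for a ZooKeeper multi op: an action plus a target path.
  record Op(String action, String path) {}

  static List<Op> buildMoveOps(String oldQueue, String newQueue, Map<String, Long> walOffsets) {
    List<Op> ops = new ArrayList<>();
    ops.add(new Op("create", newQueue));
    for (Map.Entry<String, Long> wal : walOffsets.entrySet()) {
      ops.add(new Op("create", newQueue + "/" + wal.getKey())); // carries the copied offset
      ops.add(new Op("delete", oldQueue + "/" + wal.getKey()));
    }
    ops.add(new Op("delete", oldQueue));
    // create/delete a throwaway child so the parent's child-version counter is bumped
    ops.add(new Op("create", "/queues/cversion_bump"));
    ops.add(new Op("delete", "/queues/cversion_bump"));
    return ops; // the real code hands the whole list to one atomic multi call
  }

  public static void main(String[] args) {
    buildMoveOps("/queues/rs1/1", "/queues/rs2/1-rs1", Map.of("wal.0001", 100L))
      .forEach(op -> System.out.println(op.action() + " " + op.path()));
  }
}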
|
||||
|
||||
@Override
|
||||
public void removeReplicatorIfQueueIsEmpty(ServerName serverName) throws ReplicationException {
|
||||
try {
|
||||
ZKUtil.deleteNodeFailSilent(zookeeper, getRsNode(serverName));
|
||||
} catch (NotEmptyException e) {
|
||||
// keep silent to avoid logging too much.
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to remove replicator for " + serverName, e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<ServerName> getListOfReplicators0() throws KeeperException {
|
||||
List<String> children = ZKUtil.listChildrenNoWatch(zookeeper, queuesZNode);
|
||||
if (children == null) {
|
||||
children = Collections.emptyList();
|
||||
}
|
||||
return children.stream().map(ServerName::parseServerName).collect(toList());
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<ServerName> getListOfReplicators() throws ReplicationException {
|
||||
try {
|
||||
return getListOfReplicators0();
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to get list of replicators", e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> getWALsInQueue0(ServerName serverName, String queueId)
|
||||
throws KeeperException {
|
||||
List<String> children =
|
||||
ZKUtil.listChildrenNoWatch(zookeeper, getQueueNode(serverName, queueId));
|
||||
return children != null ? children : Collections.emptyList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getWALsInQueue(ServerName serverName, String queueId)
|
||||
throws ReplicationException {
|
||||
try {
|
||||
return getWALsInQueue0(serverName, queueId);
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException(
|
||||
"Failed to get wals in queue (serverName=" + serverName + ", queueId=" + queueId + ")", e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> getAllQueues0(ServerName serverName) throws KeeperException {
|
||||
List<String> children = ZKUtil.listChildrenNoWatch(zookeeper, getRsNode(serverName));
|
||||
return children != null ? children : Collections.emptyList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getAllQueues(ServerName serverName) throws ReplicationException {
|
||||
try {
|
||||
return getAllQueues0(serverName);
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to get all queues (serverName=" + serverName + ")", e);
|
||||
}
|
||||
}
|
||||
|
||||
// will be overridden in UTs
|
||||
protected int getQueuesZNodeCversion() throws KeeperException {
|
||||
Stat stat = new Stat();
|
||||
ZKUtil.getDataNoWatch(this.zookeeper, this.queuesZNode, stat);
|
||||
return stat.getCversion();
|
||||
}
|
||||
|
||||
/**
|
||||
* The optimistic lock of this implementation is based on the cversion of the root
* {@link #queuesZNode}. Therefore, we must update the cversion of the root {@link #queuesZNode}
* when migrating wal nodes to other queues.
|
||||
* @see #claimQueue(ServerName, String, ServerName) as an example of updating root
|
||||
* {@link #queuesZNode} cversion.
|
||||
*/
|
||||
@Override
|
||||
public Set<String> getAllWALs() throws ReplicationException {
|
||||
try {
|
||||
for (int retry = 0;; retry++) {
|
||||
int v0 = getQueuesZNodeCversion();
|
||||
List<ServerName> rss = getListOfReplicators0();
|
||||
if (rss.isEmpty()) {
|
||||
LOG.debug("Didn't find a RegionServer that replicates, won't prevent deletions.");
|
||||
return Collections.emptySet();
|
||||
}
|
||||
Set<String> wals = new HashSet<>();
|
||||
for (ServerName rs : rss) {
|
||||
for (String queueId : getAllQueues0(rs)) {
|
||||
wals.addAll(getWALsInQueue0(rs, queueId));
|
||||
}
|
||||
}
|
||||
int v1 = getQueuesZNodeCversion();
|
||||
if (v0 == v1) {
|
||||
return wals;
|
||||
}
|
||||
LOG.info("Replication queue node cversion changed from %d to %d, retry = %d", v0, v1,
|
||||
retry);
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to get all wals", e);
|
||||
}
|
||||
}
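The loop above is the read side of the optimistic lock: snapshot the cversion, scan every queue, re-read the cversion, and retry the whole scan if it moved, since a concurrent claimQueue may have shuffled wals between queues mid-scan. A minimal standalone sketch of that read-validate-retry pattern (plain Java stand-ins, not the HBase classes):

import java.util.ArrayList;
import java.util.List;

// Hypothetical stand-in for the queues znode: a child list plus a cversion-like counter.
final class QueueTreeSketch {
  private final List<String> wals = new ArrayList<>();
  private int cversion = 0;

  synchronized int cversion() { return cversion; }

  synchronized void mutate(String wal) { // e.g. a claimQueue moving a wal
    wals.add(wal);
    cversion++; // every structural change bumps the counter
  }

  synchronized List<String> snapshot() { return new ArrayList<>(wals); }
}

public final class OptimisticReadSketch {
  // Read-all with an optimistic lock: retry whenever the counter moved during the scan.
  static List<String> readAll(QueueTreeSketch tree) {
    for (;;) {
      int v0 = tree.cversion();
      List<String> result = tree.snapshot();
      int v1 = tree.cversion();
      if (v0 == v1) {
        return result; // nothing changed while we were reading
      }
      // a concurrent mutation ran; scan again
    }
  }

  public static void main(String[] args) {
    QueueTreeSketch tree = new QueueTreeSketch();
    tree.mutate("wal-1");
    System.out.println(readAll(tree)); // [wal-1]
  }
}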
|
||||
|
||||
private String getHFileRefsPeerNode(String peerId) {
|
||||
return ZNodePaths.joinZNode(hfileRefsZNode, peerId);
|
||||
}
|
||||
|
||||
private String getHFileNode(String peerNode, String fileName) {
|
||||
return ZNodePaths.joinZNode(peerNode, fileName);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addPeerToHFileRefs(String peerId) throws ReplicationException {
|
||||
String peerNode = getHFileRefsPeerNode(peerId);
|
||||
try {
|
||||
if (ZKUtil.checkExists(zookeeper, peerNode) == -1) {
|
||||
LOG.info("Adding peer {} to hfile reference queue.", peerId);
|
||||
ZKUtil.createWithParents(zookeeper, peerNode);
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to add peer " + peerId + " to hfile reference queue.",
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void removePeerFromHFileRefs(String peerId) throws ReplicationException {
|
||||
String peerNode = getHFileRefsPeerNode(peerId);
|
||||
try {
|
||||
if (ZKUtil.checkExists(zookeeper, peerNode) == -1) {
|
||||
LOG.debug("Peer {} not found in hfile reference queue.", peerNode);
|
||||
} else {
|
||||
LOG.info("Removing peer {} from hfile reference queue.", peerNode);
|
||||
ZKUtil.deleteNodeRecursively(zookeeper, peerNode);
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException(
|
||||
"Failed to remove peer " + peerId + " from hfile reference queue.", e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void addHFileRefs(String peerId, List<Pair<Path, Path>> pairs)
|
||||
throws ReplicationException {
|
||||
String peerNode = getHFileRefsPeerNode(peerId);
|
||||
LOG.debug("Adding hfile references {} in queue {}", pairs, peerNode);
|
||||
List<ZKUtilOp> listOfOps =
|
||||
pairs.stream().map(p -> p.getSecond().getName()).map(n -> getHFileNode(peerNode, n))
|
||||
.map(f -> ZKUtilOp.createAndFailSilent(f, HConstants.EMPTY_BYTE_ARRAY)).collect(toList());
|
||||
LOG.debug("The multi list size for adding hfile references in zk for node {} is {}", peerNode,
|
||||
listOfOps.size());
|
||||
try {
|
||||
ZKUtil.multiOrSequential(this.zookeeper, listOfOps, true);
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to add hfile reference to peer " + peerId, e);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public void removeHFileRefs(String peerId, List<String> files) throws ReplicationException {
|
||||
String peerNode = getHFileRefsPeerNode(peerId);
|
||||
LOG.debug("Removing hfile references {} from queue {}", files, peerNode);
|
||||
|
||||
List<ZKUtilOp> listOfOps = files.stream().map(n -> getHFileNode(peerNode, n))
|
||||
.map(ZKUtilOp::deleteNodeFailSilent).collect(toList());
|
||||
LOG.debug("The multi list size for removing hfile references in zk for node {} is {}", peerNode,
|
||||
listOfOps.size());
|
||||
try {
|
||||
ZKUtil.multiOrSequential(this.zookeeper, listOfOps, true);
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to remove hfile reference from peer " + peerId, e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> getAllPeersFromHFileRefsQueue0() throws KeeperException {
|
||||
List<String> children = ZKUtil.listChildrenNoWatch(zookeeper, hfileRefsZNode);
|
||||
return children != null ? children : Collections.emptyList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getAllPeersFromHFileRefsQueue() throws ReplicationException {
|
||||
try {
|
||||
return getAllPeersFromHFileRefsQueue0();
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to get list of all peers in hfile references node.",
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
private List<String> getReplicableHFiles0(String peerId) throws KeeperException {
|
||||
List<String> children =
|
||||
ZKUtil.listChildrenNoWatch(this.zookeeper, getHFileRefsPeerNode(peerId));
|
||||
return children != null ? children : Collections.emptyList();
|
||||
}
|
||||
|
||||
@Override
|
||||
public List<String> getReplicableHFiles(String peerId) throws ReplicationException {
|
||||
try {
|
||||
return getReplicableHFiles0(peerId);
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to get list of hfile references for peer " + peerId,
|
||||
e);
|
||||
}
|
||||
}
|
||||
|
||||
// will be overridden in UTs
|
||||
protected int getHFileRefsZNodeCversion() throws ReplicationException {
|
||||
Stat stat = new Stat();
|
||||
try {
|
||||
ZKUtil.getDataNoWatch(zookeeper, hfileRefsZNode, stat);
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to get stat of replication hfile references node.", e);
|
||||
}
|
||||
return stat.getCversion();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<String> getAllHFileRefs() throws ReplicationException {
|
||||
try {
|
||||
for (int retry = 0;; retry++) {
|
||||
int v0 = getHFileRefsZNodeCversion();
|
||||
List<String> peers = getAllPeersFromHFileRefsQueue();
|
||||
if (peers.isEmpty()) {
|
||||
LOG.debug("Didn't find any peers with hfile references, won't prevent deletions.");
|
||||
return Collections.emptySet();
|
||||
}
|
||||
Set<String> hfileRefs = new HashSet<>();
|
||||
for (String peer : peers) {
|
||||
hfileRefs.addAll(getReplicableHFiles0(peer));
|
||||
}
|
||||
int v1 = getHFileRefsZNodeCversion();
|
||||
if (v0 == v1) {
|
||||
return hfileRefs;
|
||||
}
|
||||
LOG.debug("Replication hfile references node cversion changed from %d to %d, retry = %d",
|
||||
v0, v1, retry);
|
||||
}
|
||||
} catch (KeeperException e) {
|
||||
throw new ReplicationException("Failed to get all hfile refs", e);
|
||||
}
|
||||
}
|
||||
}
|
|
@ -17,30 +17,18 @@
|
|||
*/
|
||||
package org.apache.hadoop.hbase.replication;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.hasItems;
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeer.PeerState;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.zookeeper.MiniZooKeeperCluster;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKConfig;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.junit.Test;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap;
|
||||
|
||||
/**
|
||||
* White box testing for replication state interfaces. Implementations should extend this class and
|
||||
* initialize the interfaces properly.
|
||||
|
@ -49,7 +37,6 @@ public abstract class TestReplicationStateBasic {
|
|||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestReplicationStateBasic.class);
|
||||
|
||||
protected ReplicationQueueStorage rqs;
|
||||
protected ServerName server1 = ServerName.valueOf("hostname1.example.org", 1234, 12345);
|
||||
protected ServerName server2 = ServerName.valueOf("hostname2.example.org", 1234, 12345);
|
||||
protected ServerName server3 = ServerName.valueOf("hostname3.example.org", 1234, 12345);
|
||||
|
@ -66,161 +53,6 @@ public abstract class TestReplicationStateBasic {
|
|||
protected static final int ZK_MAX_COUNT = 300;
|
||||
protected static final int ZK_SLEEP_INTERVAL = 100; // millis
|
||||
|
||||
@Test
|
||||
public void testReplicationQueueStorage() throws ReplicationException {
|
||||
// Test methods with empty state
|
||||
assertEquals(0, rqs.getListOfReplicators().size());
|
||||
assertTrue(rqs.getWALsInQueue(server1, "qId1").isEmpty());
|
||||
assertTrue(rqs.getAllQueues(server1).isEmpty());
|
||||
|
||||
/*
|
||||
* Set up data. Two replicators: -- server1: three queues with 0, 1 and 2 log files respectively
* -- server2: zero queues
|
||||
*/
|
||||
rqs.addWAL(server1, "qId1", "trash");
|
||||
rqs.removeWAL(server1, "qId1", "trash");
|
||||
rqs.addWAL(server1, "qId2", "filename1");
|
||||
rqs.addWAL(server1, "qId3", "filename2");
|
||||
rqs.addWAL(server1, "qId3", "filename3");
|
||||
rqs.addWAL(server2, "trash", "trash");
|
||||
rqs.removeQueue(server2, "trash");
|
||||
|
||||
List<ServerName> reps = rqs.getListOfReplicators();
|
||||
assertEquals(2, reps.size());
|
||||
assertTrue(server1.getServerName(), reps.contains(server1));
|
||||
assertTrue(server2.getServerName(), reps.contains(server2));
|
||||
|
||||
assertTrue(rqs.getWALsInQueue(ServerName.valueOf("bogus", 12345, 12345), "bogus").isEmpty());
|
||||
assertTrue(rqs.getWALsInQueue(server1, "bogus").isEmpty());
|
||||
assertEquals(0, rqs.getWALsInQueue(server1, "qId1").size());
|
||||
assertEquals(1, rqs.getWALsInQueue(server1, "qId2").size());
|
||||
assertEquals("filename1", rqs.getWALsInQueue(server1, "qId2").get(0));
|
||||
|
||||
assertTrue(rqs.getAllQueues(ServerName.valueOf("bogus", 12345, -1L)).isEmpty());
|
||||
assertEquals(0, rqs.getAllQueues(server2).size());
|
||||
List<String> list = rqs.getAllQueues(server1);
|
||||
assertEquals(3, list.size());
|
||||
assertTrue(list.contains("qId2"));
|
||||
assertTrue(list.contains("qId3"));
|
||||
}
|
||||
|
||||
private void removeAllQueues(ServerName serverName) throws ReplicationException {
|
||||
for (String queue : rqs.getAllQueues(serverName)) {
|
||||
rqs.removeQueue(serverName, queue);
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplicationQueues() throws ReplicationException {
|
||||
// Initialize ReplicationPeer so we can add peers (we don't transfer lone queues)
|
||||
rp.init();
|
||||
|
||||
rqs.removeQueue(server1, "bogus");
|
||||
rqs.removeWAL(server1, "bogus", "bogus");
|
||||
removeAllQueues(server1);
|
||||
assertEquals(0, rqs.getAllQueues(server1).size());
|
||||
assertEquals(0, rqs.getWALPosition(server1, "bogus", "bogus"));
|
||||
assertTrue(rqs.getWALsInQueue(server1, "bogus").isEmpty());
|
||||
assertTrue(rqs.getAllQueues(ServerName.valueOf("bogus", 1234, 12345)).isEmpty());
|
||||
|
||||
populateQueues();
|
||||
|
||||
assertEquals(3, rqs.getListOfReplicators().size());
|
||||
assertEquals(0, rqs.getWALsInQueue(server2, "qId1").size());
|
||||
assertEquals(5, rqs.getWALsInQueue(server3, "qId5").size());
|
||||
assertEquals(0, rqs.getWALPosition(server3, "qId1", "filename0"));
|
||||
rqs.setWALPosition(server3, "qId5", "filename4", 354L, Collections.emptyMap());
|
||||
assertEquals(354L, rqs.getWALPosition(server3, "qId5", "filename4"));
|
||||
|
||||
assertEquals(5, rqs.getWALsInQueue(server3, "qId5").size());
|
||||
assertEquals(0, rqs.getWALsInQueue(server2, "qId1").size());
|
||||
assertEquals(0, rqs.getAllQueues(server1).size());
|
||||
assertEquals(1, rqs.getAllQueues(server2).size());
|
||||
assertEquals(5, rqs.getAllQueues(server3).size());
|
||||
|
||||
assertEquals(0, rqs.getAllQueues(server1).size());
|
||||
rqs.removeReplicatorIfQueueIsEmpty(server1);
|
||||
assertEquals(2, rqs.getListOfReplicators().size());
|
||||
|
||||
List<String> queues = rqs.getAllQueues(server3);
|
||||
assertEquals(5, queues.size());
|
||||
for (String queue : queues) {
|
||||
rqs.claimQueue(server3, queue, server2);
|
||||
}
|
||||
rqs.removeReplicatorIfQueueIsEmpty(server3);
|
||||
assertEquals(1, rqs.getListOfReplicators().size());
|
||||
|
||||
assertEquals(6, rqs.getAllQueues(server2).size());
|
||||
removeAllQueues(server2);
|
||||
rqs.removeReplicatorIfQueueIsEmpty(server2);
|
||||
assertEquals(0, rqs.getListOfReplicators().size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHfileRefsReplicationQueues() throws ReplicationException, KeeperException {
|
||||
rp.init();
|
||||
|
||||
List<Pair<Path, Path>> files1 = new ArrayList<>(3);
|
||||
files1.add(new Pair<>(null, new Path("file_1")));
|
||||
files1.add(new Pair<>(null, new Path("file_2")));
|
||||
files1.add(new Pair<>(null, new Path("file_3")));
|
||||
assertTrue(rqs.getReplicableHFiles(ID_ONE).isEmpty());
|
||||
assertEquals(0, rqs.getAllPeersFromHFileRefsQueue().size());
|
||||
rp.getPeerStorage().addPeer(ID_ONE,
|
||||
ReplicationPeerConfig.newBuilder().setClusterKey(KEY_ONE).build(), true,
|
||||
SyncReplicationState.NONE);
|
||||
rqs.addPeerToHFileRefs(ID_ONE);
|
||||
rqs.addHFileRefs(ID_ONE, files1);
|
||||
assertEquals(1, rqs.getAllPeersFromHFileRefsQueue().size());
|
||||
assertEquals(3, rqs.getReplicableHFiles(ID_ONE).size());
|
||||
List<String> hfiles2 = new ArrayList<>(files1.size());
|
||||
for (Pair<Path, Path> p : files1) {
|
||||
hfiles2.add(p.getSecond().getName());
|
||||
}
|
||||
String removedString = hfiles2.remove(0);
|
||||
rqs.removeHFileRefs(ID_ONE, hfiles2);
|
||||
assertEquals(1, rqs.getReplicableHFiles(ID_ONE).size());
|
||||
hfiles2 = new ArrayList<>(1);
|
||||
hfiles2.add(removedString);
|
||||
rqs.removeHFileRefs(ID_ONE, hfiles2);
|
||||
assertEquals(0, rqs.getReplicableHFiles(ID_ONE).size());
|
||||
rp.getPeerStorage().removePeer(ID_ONE);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemovePeerForHFileRefs() throws ReplicationException, KeeperException {
|
||||
rp.init();
|
||||
rp.getPeerStorage().addPeer(ID_ONE,
|
||||
ReplicationPeerConfig.newBuilder().setClusterKey(KEY_ONE).build(), true,
|
||||
SyncReplicationState.NONE);
|
||||
rqs.addPeerToHFileRefs(ID_ONE);
|
||||
rp.getPeerStorage().addPeer(ID_TWO,
|
||||
ReplicationPeerConfig.newBuilder().setClusterKey(KEY_TWO).build(), true,
|
||||
SyncReplicationState.NONE);
|
||||
rqs.addPeerToHFileRefs(ID_TWO);
|
||||
|
||||
List<Pair<Path, Path>> files1 = new ArrayList<>(3);
|
||||
files1.add(new Pair<>(null, new Path("file_1")));
|
||||
files1.add(new Pair<>(null, new Path("file_2")));
|
||||
files1.add(new Pair<>(null, new Path("file_3")));
|
||||
rqs.addHFileRefs(ID_ONE, files1);
|
||||
rqs.addHFileRefs(ID_TWO, files1);
|
||||
assertEquals(2, rqs.getAllPeersFromHFileRefsQueue().size());
|
||||
assertEquals(3, rqs.getReplicableHFiles(ID_ONE).size());
|
||||
assertEquals(3, rqs.getReplicableHFiles(ID_TWO).size());
|
||||
|
||||
rp.getPeerStorage().removePeer(ID_ONE);
|
||||
rqs.removePeerFromHFileRefs(ID_ONE);
|
||||
assertEquals(1, rqs.getAllPeersFromHFileRefsQueue().size());
|
||||
assertTrue(rqs.getReplicableHFiles(ID_ONE).isEmpty());
|
||||
assertEquals(3, rqs.getReplicableHFiles(ID_TWO).size());
|
||||
|
||||
rp.getPeerStorage().removePeer(ID_TWO);
|
||||
rqs.removePeerFromHFileRefs(ID_TWO);
|
||||
assertEquals(0, rqs.getAllPeersFromHFileRefsQueue().size());
|
||||
assertTrue(rqs.getReplicableHFiles(ID_TWO).isEmpty());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplicationPeers() throws Exception {
|
||||
rp.init();
|
||||
|
@ -286,55 +118,7 @@ public abstract class TestReplicationStateBasic {
|
|||
assertNumberOfPeers(2);
|
||||
}
|
||||
|
||||
private String getFileName(String base, int i) {
|
||||
return String.format(base + "-%04d", i);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPersistLogPositionAndSeqIdAtomically() throws Exception {
|
||||
ServerName serverName1 = ServerName.valueOf("127.0.0.1", 8000, 10000);
|
||||
assertTrue(rqs.getAllQueues(serverName1).isEmpty());
|
||||
String queue1 = "1";
|
||||
String region0 = "6b2c8f8555335cc9af74455b94516cbe",
|
||||
region1 = "6ecd2e9e010499f8ddef97ee8f70834f";
|
||||
for (int i = 0; i < 10; i++) {
|
||||
rqs.addWAL(serverName1, queue1, getFileName("file1", i));
|
||||
}
|
||||
List<String> queueIds = rqs.getAllQueues(serverName1);
|
||||
assertEquals(1, queueIds.size());
|
||||
assertThat(queueIds, hasItems("1"));
|
||||
|
||||
List<String> wals1 = rqs.getWALsInQueue(serverName1, queue1);
|
||||
assertEquals(10, wals1.size());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertThat(wals1, hasItems(getFileName("file1", i)));
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertEquals(0, rqs.getWALPosition(serverName1, queue1, getFileName("file1", i)));
|
||||
}
|
||||
assertEquals(HConstants.NO_SEQNUM, rqs.getLastSequenceId(region0, queue1));
|
||||
assertEquals(HConstants.NO_SEQNUM, rqs.getLastSequenceId(region1, queue1));
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
rqs.setWALPosition(serverName1, queue1, getFileName("file1", i), (i + 1) * 100,
|
||||
ImmutableMap.of(region0, i * 100L, region1, (i + 1) * 100L));
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertEquals((i + 1) * 100, rqs.getWALPosition(serverName1, queue1, getFileName("file1", i)));
|
||||
}
|
||||
assertEquals(900L, rqs.getLastSequenceId(region0, queue1));
|
||||
assertEquals(1000L, rqs.getLastSequenceId(region1, queue1));
|
||||
|
||||
// Try to decrease the last pushed id by setWALPosition method.
|
||||
rqs.setWALPosition(serverName1, queue1, getFileName("file1", 0), 11 * 100,
|
||||
ImmutableMap.of(region0, 899L, region1, 1001L));
|
||||
assertEquals(900L, rqs.getLastSequenceId(region0, queue1));
|
||||
assertEquals(1001L, rqs.getLastSequenceId(region1, queue1));
|
||||
}
|
||||
|
||||
protected void assertConnectedPeerStatus(boolean status, String peerId) throws Exception {
|
||||
private void assertConnectedPeerStatus(boolean status, String peerId) throws Exception {
|
||||
// we can first check if the value was changed in the store, if it wasn't then fail right away
|
||||
if (status != rp.getPeerStorage().isPeerEnabled(peerId)) {
|
||||
fail("ConnectedPeerStatus was " + !status + " but expected " + status + " in ZK");
|
||||
|
@ -353,30 +137,7 @@ public abstract class TestReplicationStateBasic {
|
|||
}
|
||||
}
|
||||
|
||||
protected void assertNumberOfPeers(int total) throws ReplicationException {
|
||||
private void assertNumberOfPeers(int total) throws ReplicationException {
|
||||
assertEquals(total, rp.getPeerStorage().listPeerIds().size());
|
||||
}
|
||||
|
||||
/*
|
||||
* three replicators: rq1 has 0 queues, rq2 has 1 queue with no logs, rq3 has 5 queues with 1, 2,
|
||||
* 3, 4, 5 log files respectively
|
||||
*/
|
||||
protected void populateQueues() throws ReplicationException {
|
||||
rqs.addWAL(server1, "trash", "trash");
|
||||
rqs.removeQueue(server1, "trash");
|
||||
|
||||
rqs.addWAL(server2, "qId1", "trash");
|
||||
rqs.removeWAL(server2, "qId1", "trash");
|
||||
|
||||
for (int i = 1; i < 6; i++) {
|
||||
for (int j = 0; j < i; j++) {
|
||||
rqs.addWAL(server3, "qId" + i, "filename" + j);
|
||||
}
|
||||
// Add peers for the corresponding queues so they are not orphans
|
||||
rp.getPeerStorage().addPeer("qId" + i,
|
||||
ReplicationPeerConfig.newBuilder()
|
||||
.setClusterKey(MiniZooKeeperCluster.HOST + ":2818:/bogus" + i).build(),
|
||||
true, SyncReplicationState.NONE);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -80,7 +80,6 @@ public class TestReplicationStateZKImpl extends TestReplicationStateBasic {
|
|||
@Before
|
||||
public void setUp() throws IOException {
|
||||
zkTimeoutCount = 0;
|
||||
rqs = ReplicationStorageFactory.getReplicationQueueStorage(zkw, conf);
|
||||
rp =
|
||||
ReplicationFactory.getReplicationPeers(FileSystem.get(utility.getConfiguration()), zkw, conf);
|
||||
OUR_KEY = ZKConfig.getZooKeeperClusterKey(conf);
|
||||
|
|
|
@ -1,341 +0,0 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.replication;
|
||||
|
||||
import static org.hamcrest.CoreMatchers.hasItems;
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseZKTestingUtil;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.testclassification.ReplicationTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.MD5Hash;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap;
|
||||
|
||||
@Category({ ReplicationTests.class, MediumTests.class })
|
||||
public class TestZKReplicationQueueStorage {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestZKReplicationQueueStorage.class);
|
||||
|
||||
private static final HBaseZKTestingUtil UTIL = new HBaseZKTestingUtil();
|
||||
|
||||
private static ZKReplicationQueueStorage STORAGE;
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
UTIL.startMiniZKCluster();
|
||||
STORAGE = new ZKReplicationQueueStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration());
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void tearDown() throws IOException {
|
||||
UTIL.shutdownMiniZKCluster();
|
||||
}
|
||||
|
||||
@After
|
||||
public void tearDownAfterTest() throws ReplicationException, KeeperException, IOException {
|
||||
for (ServerName serverName : STORAGE.getListOfReplicators()) {
|
||||
for (String queue : STORAGE.getAllQueues(serverName)) {
|
||||
STORAGE.removeQueue(serverName, queue);
|
||||
}
|
||||
STORAGE.removeReplicatorIfQueueIsEmpty(serverName);
|
||||
}
|
||||
for (String peerId : STORAGE.getAllPeersFromHFileRefsQueue()) {
|
||||
STORAGE.removePeerFromHFileRefs(peerId);
|
||||
}
|
||||
}
|
||||
|
||||
private ServerName getServerName(int i) {
|
||||
return ServerName.valueOf("127.0.0.1", 8000 + i, 10000 + i);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplicator() throws ReplicationException {
|
||||
assertTrue(STORAGE.getListOfReplicators().isEmpty());
|
||||
String queueId = "1";
|
||||
for (int i = 0; i < 10; i++) {
|
||||
STORAGE.addWAL(getServerName(i), queueId, "file" + i);
|
||||
}
|
||||
List<ServerName> replicators = STORAGE.getListOfReplicators();
|
||||
assertEquals(10, replicators.size());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertThat(replicators, hasItems(getServerName(i)));
|
||||
}
|
||||
for (int i = 0; i < 5; i++) {
|
||||
STORAGE.removeQueue(getServerName(i), queueId);
|
||||
}
|
||||
for (int i = 0; i < 10; i++) {
|
||||
STORAGE.removeReplicatorIfQueueIsEmpty(getServerName(i));
|
||||
}
|
||||
replicators = STORAGE.getListOfReplicators();
|
||||
assertEquals(5, replicators.size());
|
||||
for (int i = 5; i < 10; i++) {
|
||||
assertThat(replicators, hasItems(getServerName(i)));
|
||||
}
|
||||
}
|
||||
|
||||
private String getFileName(String base, int i) {
|
||||
return String.format(base + "-%04d", i);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testAddRemoveLog() throws ReplicationException {
|
||||
ServerName serverName1 = ServerName.valueOf("127.0.0.1", 8000, 10000);
|
||||
assertTrue(STORAGE.getAllQueues(serverName1).isEmpty());
|
||||
String queue1 = "1";
|
||||
String queue2 = "2";
|
||||
for (int i = 0; i < 10; i++) {
|
||||
STORAGE.addWAL(serverName1, queue1, getFileName("file1", i));
|
||||
STORAGE.addWAL(serverName1, queue2, getFileName("file2", i));
|
||||
}
|
||||
List<String> queueIds = STORAGE.getAllQueues(serverName1);
|
||||
assertEquals(2, queueIds.size());
|
||||
assertThat(queueIds, hasItems("1", "2"));
|
||||
|
||||
List<String> wals1 = STORAGE.getWALsInQueue(serverName1, queue1);
|
||||
List<String> wals2 = STORAGE.getWALsInQueue(serverName1, queue2);
|
||||
assertEquals(10, wals1.size());
|
||||
assertEquals(10, wals2.size());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertThat(wals1, hasItems(getFileName("file1", i)));
|
||||
assertThat(wals2, hasItems(getFileName("file2", i)));
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertEquals(0, STORAGE.getWALPosition(serverName1, queue1, getFileName("file1", i)));
|
||||
assertEquals(0, STORAGE.getWALPosition(serverName1, queue2, getFileName("file2", i)));
|
||||
STORAGE.setWALPosition(serverName1, queue1, getFileName("file1", i), (i + 1) * 100,
|
||||
Collections.emptyMap());
|
||||
STORAGE.setWALPosition(serverName1, queue2, getFileName("file2", i), (i + 1) * 100 + 10,
|
||||
Collections.emptyMap());
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertEquals((i + 1) * 100,
|
||||
STORAGE.getWALPosition(serverName1, queue1, getFileName("file1", i)));
|
||||
assertEquals((i + 1) * 100 + 10,
|
||||
STORAGE.getWALPosition(serverName1, queue2, getFileName("file2", i)));
|
||||
}
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
if (i % 2 == 0) {
|
||||
STORAGE.removeWAL(serverName1, queue1, getFileName("file1", i));
|
||||
} else {
|
||||
STORAGE.removeWAL(serverName1, queue2, getFileName("file2", i));
|
||||
}
|
||||
}
|
||||
|
||||
queueIds = STORAGE.getAllQueues(serverName1);
|
||||
assertEquals(2, queueIds.size());
|
||||
assertThat(queueIds, hasItems("1", "2"));
|
||||
|
||||
ServerName serverName2 = ServerName.valueOf("127.0.0.1", 8001, 10001);
|
||||
Pair<String, SortedSet<String>> peer1 = STORAGE.claimQueue(serverName1, "1", serverName2);
|
||||
|
||||
assertEquals("1-" + serverName1.getServerName(), peer1.getFirst());
|
||||
assertEquals(5, peer1.getSecond().size());
|
||||
int i = 1;
|
||||
for (String wal : peer1.getSecond()) {
|
||||
assertEquals(getFileName("file1", i), wal);
|
||||
assertEquals((i + 1) * 100,
|
||||
STORAGE.getWALPosition(serverName2, peer1.getFirst(), getFileName("file1", i)));
|
||||
i += 2;
|
||||
}
|
||||
|
||||
queueIds = STORAGE.getAllQueues(serverName1);
|
||||
assertEquals(1, queueIds.size());
|
||||
assertThat(queueIds, hasItems("2"));
|
||||
wals2 = STORAGE.getWALsInQueue(serverName1, queue2);
|
||||
assertEquals(5, wals2.size());
|
||||
for (i = 0; i < 10; i += 2) {
|
||||
assertThat(wals2, hasItems(getFileName("file2", i)));
|
||||
}
|
||||
|
||||
queueIds = STORAGE.getAllQueues(serverName2);
|
||||
assertEquals(1, queueIds.size());
|
||||
assertThat(queueIds, hasItems(peer1.getFirst()));
|
||||
wals1 = STORAGE.getWALsInQueue(serverName2, peer1.getFirst());
|
||||
assertEquals(5, wals1.size());
|
||||
for (i = 1; i < 10; i += 2) {
|
||||
assertThat(wals1, hasItems(getFileName("file1", i)));
|
||||
}
|
||||
|
||||
Set<String> allWals = STORAGE.getAllWALs();
|
||||
assertEquals(10, allWals.size());
|
||||
for (i = 0; i < 10; i++) {
|
||||
assertThat(allWals, hasItems(i % 2 == 0 ? getFileName("file2", i) : getFileName("file1", i)));
|
||||
}
|
||||
}
|
||||
|
||||
// For HBASE-12865, HBASE-26482
|
||||
@Test
|
||||
public void testClaimQueueChangeCversion() throws ReplicationException, KeeperException {
|
||||
ServerName serverName1 = ServerName.valueOf("127.0.0.1", 8000, 10000);
|
||||
STORAGE.addWAL(serverName1, "1", "file");
|
||||
STORAGE.addWAL(serverName1, "2", "file");
|
||||
|
||||
ServerName serverName2 = ServerName.valueOf("127.0.0.1", 8001, 10001);
|
||||
// Avoid claimQueue updating the cversion when preparing server2's rsNode.
|
||||
STORAGE.addWAL(serverName2, "1", "file");
|
||||
STORAGE.addWAL(serverName2, "2", "file");
|
||||
|
||||
int v0 = STORAGE.getQueuesZNodeCversion();
|
||||
|
||||
STORAGE.claimQueue(serverName1, "1", serverName2);
|
||||
int v1 = STORAGE.getQueuesZNodeCversion();
|
||||
// cversion should be increased by claimQueue method.
|
||||
assertTrue(v1 > v0);
|
||||
|
||||
STORAGE.claimQueue(serverName1, "2", serverName2);
|
||||
int v2 = STORAGE.getQueuesZNodeCversion();
|
||||
// cversion should be increased by claimQueue method.
|
||||
assertTrue(v2 > v1);
|
||||
}
|
||||
|
||||
private ZKReplicationQueueStorage createWithUnstableVersion() throws IOException {
|
||||
return new ZKReplicationQueueStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration()) {
|
||||
|
||||
private int called = 0;
|
||||
private int getLastSeqIdOpIndex = 0;
|
||||
|
||||
@Override
|
||||
protected int getQueuesZNodeCversion() throws KeeperException {
|
||||
if (called < 4) {
|
||||
called++;
|
||||
}
|
||||
return called;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Pair<Long, Integer> getLastSequenceIdWithVersion(String encodedRegionName,
|
||||
String peerId) throws KeeperException {
|
||||
Pair<Long, Integer> oldPair = super.getLastSequenceIdWithVersion(encodedRegionName, peerId);
|
||||
if (getLastSeqIdOpIndex < 100) {
|
||||
// Let the ZNode version increase.
|
||||
String path = getSerialReplicationRegionPeerNode(encodedRegionName, peerId);
|
||||
ZKUtil.createWithParents(zookeeper, path);
|
||||
ZKUtil.setData(zookeeper, path, ZKUtil.positionToByteArray(100L));
|
||||
}
|
||||
getLastSeqIdOpIndex++;
|
||||
return oldPair;
|
||||
}
|
||||
};
|
||||
}
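The override above makes the version getter report a moving value for the first few calls so the optimistic read is forced to retry before it stabilizes. The same testing trick distilled into a standalone sketch (hypothetical names, plain Java):

public final class UnstableVersionSketch {
  static class Store {
    protected int version() { return 0; }

    // Optimistic read: retry until two consecutive version reads agree.
    int stableRead() {
      for (int retry = 0;; retry++) {
        int v0 = version();
        int v1 = version();
        if (v0 == v1) {
          return retry; // how many retries were needed
        }
      }
    }
  }

  public static void main(String[] args) {
    Store unstable = new Store() {
      private int called = 0;

      @Override
      protected int version() {
        // report a moving version for the first few calls, then stabilize
        return called < 4 ? called++ : called;
      }
    };
    System.out.println("retries = " + unstable.stableRead()); // retries = 2
  }
}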
|
||||
|
||||
@Test
|
||||
public void testGetAllWALsCversionChange() throws IOException, ReplicationException {
|
||||
ZKReplicationQueueStorage storage = createWithUnstableVersion();
|
||||
storage.addWAL(getServerName(0), "1", "file");
|
||||
// This should return eventually when cversion stabilizes
|
||||
Set<String> allWals = storage.getAllWALs();
|
||||
assertEquals(1, allWals.size());
|
||||
assertThat(allWals, hasItems("file"));
|
||||
}
|
||||
|
||||
// For HBASE-14621
|
||||
@Test
|
||||
public void testGetAllHFileRefsCversionChange() throws IOException, ReplicationException {
|
||||
ZKReplicationQueueStorage storage = createWithUnstableVersion();
|
||||
storage.addPeerToHFileRefs("1");
|
||||
Path p = new Path("/test");
|
||||
storage.addHFileRefs("1", Arrays.asList(Pair.newPair(p, p)));
|
||||
// This should return eventually when cversion stabilizes
|
||||
Set<String> allHFileRefs = storage.getAllHFileRefs();
|
||||
assertEquals(1, allHFileRefs.size());
|
||||
assertThat(allHFileRefs, hasItems("test"));
|
||||
}
|
||||
|
||||
// For HBASE-20138
|
||||
@Test
|
||||
public void testSetWALPositionBadVersion() throws IOException, ReplicationException {
|
||||
ZKReplicationQueueStorage storage = createWithUnstableVersion();
|
||||
ServerName serverName1 = ServerName.valueOf("128.0.0.1", 8000, 10000);
|
||||
assertTrue(storage.getAllQueues(serverName1).isEmpty());
|
||||
String queue1 = "1";
|
||||
String fileName = getFileName("file1", 0);
|
||||
String encodedRegionName = "31d9792f4435b99d9fb1016f6fbc8dc6";
|
||||
storage.addWAL(serverName1, queue1, fileName);
|
||||
|
||||
List<String> wals1 = storage.getWALsInQueue(serverName1, queue1);
|
||||
assertEquals(1, wals1.size());
|
||||
|
||||
assertEquals(0, storage.getWALPosition(serverName1, queue1, fileName));
|
||||
// This should return eventually when data version stabilizes
|
||||
storage.setWALPosition(serverName1, queue1, fileName, 100,
|
||||
ImmutableMap.of(encodedRegionName, 120L));
|
||||
|
||||
assertEquals(100, storage.getWALPosition(serverName1, queue1, fileName));
|
||||
assertEquals(120L, storage.getLastSequenceId(encodedRegionName, queue1));
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRegionsZNodeLayout() throws Exception {
|
||||
String peerId = "1";
|
||||
String encodedRegionName = "31d9792f4435b99d9fb1016f6fbc8dc7";
|
||||
String expectedPath = "/hbase/replication/regions/31/d9/792f4435b99d9fb1016f6fbc8dc7-" + peerId;
|
||||
String path = STORAGE.getSerialReplicationRegionPeerNode(encodedRegionName, peerId);
|
||||
assertEquals(expectedPath, path);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemoveAllLastPushedSeqIdsForPeer() throws Exception {
|
||||
String peerId = "1";
|
||||
String peerIdToDelete = "2";
|
||||
for (int i = 0; i < 100; i++) {
|
||||
String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i));
|
||||
STORAGE.setLastSequenceIds(peerId, ImmutableMap.of(encodedRegionName, (long) i));
|
||||
STORAGE.setLastSequenceIds(peerIdToDelete, ImmutableMap.of(encodedRegionName, (long) i));
|
||||
}
|
||||
for (int i = 0; i < 100; i++) {
|
||||
String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i));
|
||||
assertEquals(i, STORAGE.getLastSequenceId(encodedRegionName, peerId));
|
||||
assertEquals(i, STORAGE.getLastSequenceId(encodedRegionName, peerIdToDelete));
|
||||
}
|
||||
STORAGE.removeLastSequenceIds(peerIdToDelete);
|
||||
for (int i = 0; i < 100; i++) {
|
||||
String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i));
|
||||
assertEquals(i, STORAGE.getLastSequenceId(encodedRegionName, peerId));
|
||||
assertEquals(HConstants.NO_SEQNUM,
|
||||
STORAGE.getLastSequenceId(encodedRegionName, peerIdToDelete));
|
||||
}
|
||||
}
|
||||
}
|
|
@ -788,8 +788,7 @@ public class HMaster extends HBaseServerBase<MasterRpcServices> implements Maste
|
|||
}
|
||||
this.rsGroupInfoManager = RSGroupInfoManager.create(this);
|
||||
|
||||
this.replicationPeerManager =
|
||||
ReplicationPeerManager.create(fileSystemManager.getFileSystem(), zooKeeper, conf, clusterId);
|
||||
this.replicationPeerManager = ReplicationPeerManager.create(this, clusterId);
|
||||
this.configurationManager.registerObserver(replicationPeerManager);
|
||||
this.replicationPeerModificationStateStore =
|
||||
new ReplicationPeerModificationStateStore(masterRegion);
|
||||
|
|
|
@ -36,7 +36,7 @@ import org.apache.hadoop.hbase.master.SplitWALManager;
|
|||
import org.apache.hadoop.hbase.master.assignment.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.assignment.RegionStateNode;
|
||||
import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure;
|
||||
import org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure;
|
||||
import org.apache.hadoop.hbase.master.replication.AssignReplicationQueuesProcedure;
|
||||
import org.apache.hadoop.hbase.monitoring.MonitoredTask;
|
||||
import org.apache.hadoop.hbase.monitoring.TaskMonitor;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
|
@ -240,15 +240,33 @@ public class ServerCrashProcedure extends
|
|||
}
|
||||
assignRegions(env, regionsOnCrashedServer);
|
||||
}
|
||||
setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
|
||||
// If there is no replication peer, we do not need to enter the claim queues stage.
|
||||
// This is also very important because now we will initialize the ReplicationQueueStorage later,
// so if there is no replication peer added yet, the storage can not be accessed.
// And there will be no race because:
// 1. For adding a replication peer, if the peer storage has not been updated yet, the crashed
// region server will not have any replication queues for this peer, so it is safe to skip
// claiming.
// 2. For removing a replication peer, if it has already updated the peer storage, then
// there is no way to roll back and region servers have already started to close and delete
// replication queues, so it is also safe to skip claiming.
|
||||
if (env.getReplicationPeerManager().listPeers(null).isEmpty()) {
|
||||
setNextState(ServerCrashState.SERVER_CRASH_FINISH);
|
||||
} else {
|
||||
setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
|
||||
}
|
||||
break;
|
||||
case SERVER_CRASH_HANDLE_RIT2:
|
||||
// Noop. Left in place because we used to call handleRIT here for a second time
|
||||
// but no longer necessary since HBASE-20634.
|
||||
setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
|
||||
if (env.getReplicationPeerManager().listPeers(null).isEmpty()) {
|
||||
setNextState(ServerCrashState.SERVER_CRASH_FINISH);
|
||||
} else {
|
||||
setNextState(ServerCrashState.SERVER_CRASH_CLAIM_REPLICATION_QUEUES);
|
||||
}
|
||||
break;
|
||||
case SERVER_CRASH_CLAIM_REPLICATION_QUEUES:
|
||||
addChildProcedure(new ClaimReplicationQueuesProcedure(serverName));
|
||||
addChildProcedure(new AssignReplicationQueuesProcedure(serverName));
|
||||
setNextState(ServerCrashState.SERVER_CRASH_FINISH);
|
||||
break;
|
||||
case SERVER_CRASH_FINISH:
|
||||
|
|
|
@ -0,0 +1,204 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.master.replication;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||
import org.apache.hadoop.hbase.procedure2.StateMachineProcedure;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerDescription;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.util.RetryCounter;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.AssignReplicationQueuesState;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.AssignReplicationQueuesStateData;
|
||||
import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
|
||||
|
||||
@InterfaceAudience.Private
|
||||
public class AssignReplicationQueuesProcedure
|
||||
extends StateMachineProcedure<MasterProcedureEnv, AssignReplicationQueuesState>
|
||||
implements ServerProcedureInterface {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(AssignReplicationQueuesProcedure.class);
|
||||
|
||||
private ServerName crashedServer;
|
||||
|
||||
private RetryCounter retryCounter;
|
||||
|
||||
public AssignReplicationQueuesProcedure() {
|
||||
}
|
||||
|
||||
public AssignReplicationQueuesProcedure(ServerName crashedServer) {
|
||||
this.crashedServer = crashedServer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ServerName getServerName() {
|
||||
return crashedServer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean hasMetaTableRegion() {
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ServerOperationType getServerOperationType() {
|
||||
return ServerOperationType.CLAIM_REPLICATION_QUEUES;
|
||||
}
|
||||
|
||||
private void addMissingQueues(MasterProcedureEnv env) throws ReplicationException {
|
||||
ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage();
|
||||
|
||||
Set<String> existingQueuePeerIds = new HashSet<>();
|
||||
List<ReplicationQueueId> queueIds = storage.listAllQueueIds(crashedServer);
|
||||
for (Iterator<ReplicationQueueId> iter = queueIds.iterator(); iter.hasNext();) {
|
||||
ReplicationQueueId queueId = iter.next();
|
||||
if (!queueId.isRecovered()) {
|
||||
existingQueuePeerIds.add(queueId.getPeerId());
|
||||
}
|
||||
}
|
||||
List<ReplicationPeerDescription> peers = env.getReplicationPeerManager().listPeers(null);
|
||||
for (ReplicationPeerDescription peer : peers) {
|
||||
if (!existingQueuePeerIds.contains(peer.getPeerId())) {
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(crashedServer, peer.getPeerId());
|
||||
LOG.debug("Add replication queue {} for claiming", queueId);
|
||||
env.getReplicationPeerManager().getQueueStorage().setOffset(queueId,
|
||||
crashedServer.toString(), ReplicationGroupOffset.BEGIN, Collections.emptyMap());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private Flow claimQueues(MasterProcedureEnv env) throws ReplicationException {
|
||||
ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage();
|
||||
List<ReplicationQueueId> queueIds = storage.listAllQueueIds(crashedServer);
|
||||
if (queueIds.isEmpty()) {
|
||||
LOG.debug("Finish claiming replication queues for {}", crashedServer);
|
||||
// we are done
|
||||
return Flow.NO_MORE_STATE;
|
||||
}
|
||||
LOG.debug("There are {} replication queues need to be claimed for {}", queueIds.size(),
|
||||
crashedServer);
|
||||
List<ServerName> targetServers =
|
||||
env.getMasterServices().getServerManager().getOnlineServersList();
|
||||
if (targetServers.isEmpty()) {
|
||||
throw new ReplicationException("no region server available");
|
||||
}
|
||||
Collections.shuffle(targetServers);
|
||||
for (int i = 0, n = Math.min(queueIds.size(), targetServers.size()); i < n; i++) {
|
||||
addChildProcedure(
|
||||
new ClaimReplicationQueueRemoteProcedure(queueIds.get(i), targetServers.get(i)));
|
||||
}
|
||||
retryCounter = null;
|
||||
return Flow.HAS_MORE_STATE;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected Flow executeFromState(MasterProcedureEnv env, AssignReplicationQueuesState state)
|
||||
throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException {
|
||||
try {
|
||||
switch (state) {
|
||||
case ASSIGN_REPLICATION_QUEUES_PRE_CHECK:
|
||||
// TODO: reserved for implementing the fencing logic with Add/Remove/UpdatePeerProcedure
|
||||
setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES);
|
||||
return Flow.HAS_MORE_STATE;
|
||||
case ASSIGN_REPLICATION_QUEUES_ADD_MISSING_QUEUES:
|
||||
addMissingQueues(env);
|
||||
retryCounter = null;
|
||||
setNextState(AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_CLAIM);
|
||||
return Flow.HAS_MORE_STATE;
|
||||
case ASSIGN_REPLICATION_QUEUES_CLAIM:
|
||||
return claimQueues(env);
|
||||
default:
|
||||
throw new UnsupportedOperationException("unhandled state=" + state);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
if (retryCounter == null) {
|
||||
retryCounter = ProcedureUtil.createRetryCounter(env.getMasterConfiguration());
|
||||
}
|
||||
long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
|
||||
LOG.warn("Failed to claim replication queues for {}, suspend {}secs {}; {};", crashedServer,
|
||||
backoff / 1000, e);
|
||||
setTimeout(Math.toIntExact(backoff));
|
||||
setState(ProcedureProtos.ProcedureState.WAITING_TIMEOUT);
|
||||
skipPersistence();
|
||||
throw new ProcedureSuspendedException();
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
protected synchronized boolean setTimeoutFailure(MasterProcedureEnv env) {
|
||||
setState(ProcedureProtos.ProcedureState.RUNNABLE);
|
||||
env.getProcedureScheduler().addFront(this);
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void rollbackState(MasterProcedureEnv env, AssignReplicationQueuesState state)
|
||||
throws IOException, InterruptedException {
|
||||
throw new UnsupportedOperationException();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AssignReplicationQueuesState getState(int stateId) {
|
||||
return AssignReplicationQueuesState.forNumber(stateId);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected int getStateId(AssignReplicationQueuesState state) {
|
||||
return state.getNumber();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected AssignReplicationQueuesState getInitialState() {
|
||||
return AssignReplicationQueuesState.ASSIGN_REPLICATION_QUEUES_PRE_CHECK;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
super.serializeStateData(serializer);
|
||||
serializer.serialize(AssignReplicationQueuesStateData.newBuilder()
|
||||
.setCrashedServer(ProtobufUtil.toServerName(crashedServer)).build());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
super.deserializeStateData(serializer);
|
||||
AssignReplicationQueuesStateData proto =
|
||||
serializer.deserialize(AssignReplicationQueuesStateData.class);
|
||||
crashedServer = ProtobufUtil.toServerName(proto.getCrashedServer());
|
||||
}
|
||||
|
||||
}
|
|
@ -27,6 +27,7 @@ import org.apache.hadoop.hbase.master.procedure.ServerRemoteProcedure;
|
|||
import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteOperation;
|
||||
import org.apache.hadoop.hbase.procedure2.RemoteProcedureDispatcher.RemoteProcedure;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.regionserver.ClaimReplicationQueueCallable;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
|
@ -43,34 +44,33 @@ public class ClaimReplicationQueueRemoteProcedure extends ServerRemoteProcedure
|
|||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(ClaimReplicationQueueRemoteProcedure.class);
|
||||
|
||||
private ServerName crashedServer;
|
||||
|
||||
private String queue;
|
||||
private ReplicationQueueId queueId;
|
||||
|
||||
public ClaimReplicationQueueRemoteProcedure() {
|
||||
}
|
||||
|
||||
public ClaimReplicationQueueRemoteProcedure(ServerName crashedServer, String queue,
|
||||
ServerName targetServer) {
|
||||
this.crashedServer = crashedServer;
|
||||
this.queue = queue;
|
||||
public ClaimReplicationQueueRemoteProcedure(ReplicationQueueId queueId, ServerName targetServer) {
|
||||
this.queueId = queueId;
|
||||
this.targetServer = targetServer;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Optional<RemoteOperation> remoteCallBuild(MasterProcedureEnv env, ServerName remote) {
|
||||
assert targetServer.equals(remote);
|
||||
ClaimReplicationQueueRemoteParameter.Builder builder = ClaimReplicationQueueRemoteParameter
|
||||
.newBuilder().setCrashedServer(ProtobufUtil.toServerName(queueId.getServerName()))
|
||||
.setQueue(queueId.getPeerId());
|
||||
queueId.getSourceServerName()
|
||||
.ifPresent(sourceServer -> builder.setSourceServer(ProtobufUtil.toServerName(sourceServer)));
|
||||
return Optional.of(new ServerOperation(this, getProcId(), ClaimReplicationQueueCallable.class,
|
||||
ClaimReplicationQueueRemoteParameter.newBuilder()
|
||||
.setCrashedServer(ProtobufUtil.toServerName(crashedServer)).setQueue(queue).build()
|
||||
.toByteArray()));
|
||||
builder.build().toByteArray()));
|
||||
}
|
||||
|
||||
@Override
|
||||
public ServerName getServerName() {
|
||||
// return crashed server here, as we are going to recover its replication queues so we should
|
||||
// use its scheduler queue instead of the one for the target server.
|
||||
return crashedServer;
|
||||
return queueId.getServerName();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -86,8 +86,7 @@ public class ClaimReplicationQueueRemoteProcedure extends ServerRemoteProcedure
|
|||
@Override
|
||||
protected void complete(MasterProcedureEnv env, Throwable error) {
|
||||
if (error != null) {
|
||||
LOG.warn("Failed to claim replication queue {} of crashed server on server {} ", queue,
|
||||
crashedServer, targetServer, error);
|
||||
LOG.warn("Failed to claim replication queue {} on server {} ", queueId, targetServer, error);
|
||||
this.succ = false;
|
||||
} else {
|
||||
this.succ = true;
|
||||
|
@ -111,17 +110,26 @@ public class ClaimReplicationQueueRemoteProcedure extends ServerRemoteProcedure
|
|||
|
||||
@Override
|
||||
protected void serializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
serializer.serialize(ClaimReplicationQueueRemoteStateData.newBuilder()
|
||||
.setCrashedServer(ProtobufUtil.toServerName(crashedServer)).setQueue(queue)
|
||||
.setTargetServer(ProtobufUtil.toServerName(targetServer)).build());
|
||||
ClaimReplicationQueueRemoteStateData.Builder builder = ClaimReplicationQueueRemoteStateData
|
||||
.newBuilder().setCrashedServer(ProtobufUtil.toServerName(queueId.getServerName()))
|
||||
.setQueue(queueId.getPeerId()).setTargetServer(ProtobufUtil.toServerName(targetServer));
|
||||
queueId.getSourceServerName()
|
||||
.ifPresent(sourceServer -> builder.setSourceServer(ProtobufUtil.toServerName(sourceServer)));
|
||||
serializer.serialize(builder.build());
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void deserializeStateData(ProcedureStateSerializer serializer) throws IOException {
|
||||
ClaimReplicationQueueRemoteStateData data =
|
||||
serializer.deserialize(ClaimReplicationQueueRemoteStateData.class);
|
||||
crashedServer = ProtobufUtil.toServerName(data.getCrashedServer());
|
||||
queue = data.getQueue();
|
||||
targetServer = ProtobufUtil.toServerName(data.getTargetServer());
|
||||
ServerName crashedServer = ProtobufUtil.toServerName(data.getCrashedServer());
|
||||
String queue = data.getQueue();
|
||||
if (data.hasSourceServer()) {
|
||||
queueId = new ReplicationQueueId(crashedServer, queue,
|
||||
ProtobufUtil.toServerName(data.getSourceServer()));
|
||||
} else {
|
||||
queueId = new ReplicationQueueId(crashedServer, queue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -19,8 +19,10 @@ package org.apache.hadoop.hbase.master.replication;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Set;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerProcedureInterface;
|
||||
|
@ -30,7 +32,9 @@ import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException;
|
|||
import org.apache.hadoop.hbase.procedure2.ProcedureUtil;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureYieldException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerDescription;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.util.RetryCounter;
|
||||
import org.apache.hadoop.hbase.util.ServerRegionReplicaUtil;
|
||||
|
@ -44,7 +48,10 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos;
|
|||
|
||||
/**
|
||||
* Used to assign the replication queues of a dead server to other region servers.
|
||||
* @deprecated Use {@link AssignReplicationQueuesProcedure} instead, kept only for keeping
|
||||
* compatibility.
|
||||
*/
|
||||
@Deprecated
|
||||
@InterfaceAudience.Private
|
||||
public class ClaimReplicationQueuesProcedure extends Procedure<MasterProcedureEnv>
|
||||
implements ServerProcedureInterface {
|
||||
|
@ -82,22 +89,36 @@ public class ClaimReplicationQueuesProcedure extends Procedure<MasterProcedureEn
|
|||
throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException {
|
||||
ReplicationQueueStorage storage = env.getReplicationPeerManager().getQueueStorage();
|
||||
try {
|
||||
List<String> queues = storage.getAllQueues(crashedServer);
|
||||
List<ReplicationQueueId> queues = storage.listAllQueueIds(crashedServer);
|
||||
Set<String> existQueuePeerIds = new HashSet<>();
|
||||
// this is for upgrading to the new region replication framework, where we will delete the
|
||||
// legacy region_replica_replication peer directly, without deleting the replication queues,
|
||||
// as it may still be used by region servers which have not been upgraded yet.
|
||||
for (Iterator<String> iter = queues.iterator(); iter.hasNext();) {
|
||||
ReplicationQueueInfo queue = new ReplicationQueueInfo(iter.next());
|
||||
if (queue.getPeerId().equals(ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER)) {
|
||||
// legacy region_replica_replication peer directly, without deleting the replication queues
|
||||
for (Iterator<ReplicationQueueId> iter = queues.iterator(); iter.hasNext();) {
|
||||
ReplicationQueueId queueId = iter.next();
|
||||
if (queueId.getPeerId().equals(ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER)) {
|
||||
LOG.info("Found replication queue {} for legacy region replication peer, "
|
||||
+ "skipping claiming and removing...", queue.getQueueId());
|
||||
+ "skipping claiming and removing...", queueId);
|
||||
iter.remove();
|
||||
storage.removeQueue(crashedServer, queue.getQueueId());
|
||||
storage.removeQueue(queueId);
|
||||
} else if (!queueId.isRecovered()) {
|
||||
existQueuePeerIds.add(queueId.getPeerId());
|
||||
}
|
||||
}
|
||||
List<ReplicationPeerDescription> peers = env.getReplicationPeerManager().listPeers(null);
|
||||
// TODO: the implementation is not enough yet, if there are retries, we need to know whether
|
||||
// the replication queue for the given peer has been claimed or not, otherwise this logic will
|
||||
// introduce redundant replication queues for the same peer. Add this logic to make some UTs
|
||||
// pass first.
|
||||
for (ReplicationPeerDescription peer : peers) {
|
||||
if (!existQueuePeerIds.contains(peer.getPeerId())) {
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(crashedServer, peer.getPeerId());
|
||||
env.getReplicationPeerManager().getQueueStorage().setOffset(queueId,
|
||||
crashedServer.toString(), ReplicationGroupOffset.BEGIN, Collections.emptyMap());
|
||||
queues.add(queueId);
|
||||
}
|
||||
}
|
||||
if (queues.isEmpty()) {
|
||||
LOG.debug("Finish claiming replication queues for {}", crashedServer);
|
||||
storage.removeReplicatorIfQueueIsEmpty(crashedServer);
|
||||
// we are done
|
||||
return null;
|
||||
}
|
||||
|
@ -112,8 +133,7 @@ public class ClaimReplicationQueuesProcedure extends Procedure<MasterProcedureEn
|
|||
ClaimReplicationQueueRemoteProcedure[] procs =
|
||||
new ClaimReplicationQueueRemoteProcedure[Math.min(queues.size(), targetServers.size())];
|
||||
for (int i = 0; i < procs.length; i++) {
|
||||
procs[i] = new ClaimReplicationQueueRemoteProcedure(crashedServer, queues.get(i),
|
||||
targetServers.get(i));
|
||||
procs[i] = new ClaimReplicationQueueRemoteProcedure(queues.get(i), targetServers.get(i));
|
||||
}
|
||||
return procs;
|
||||
} catch (ReplicationException e) {
|
||||
|
|
|
@ -17,6 +17,7 @@
|
|||
*/
|
||||
package org.apache.hadoop.hbase.master.replication;
|
||||
|
||||
import com.google.errorprone.annotations.RestrictedApi;
|
||||
import java.io.IOException;
|
||||
import java.net.URI;
|
||||
import java.util.ArrayList;
|
||||
|
@ -29,6 +30,7 @@ import java.util.Set;
|
|||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.concurrent.ConcurrentMap;
|
||||
import java.util.concurrent.Semaphore;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.regex.Pattern;
|
||||
import java.util.stream.Collectors;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
|
@ -38,10 +40,13 @@ import org.apache.hadoop.fs.Path;
|
|||
import org.apache.hadoop.hbase.DoNotRetryIOException;
|
||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
import org.apache.hadoop.hbase.ReplicationPeerNotFoundException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.replication.ReplicationPeerConfigUtil;
|
||||
import org.apache.hadoop.hbase.conf.ConfigurationObserver;
|
||||
import org.apache.hadoop.hbase.master.MasterServices;
|
||||
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
|
||||
import org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait;
|
||||
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
|
||||
import org.apache.hadoop.hbase.replication.BaseReplicationEndpoint;
|
||||
import org.apache.hadoop.hbase.replication.HBaseReplicationEndpoint;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationEndpoint;
|
||||
|
@ -50,11 +55,12 @@ import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
|
|||
import org.apache.hadoop.hbase.replication.ReplicationPeerConfigBuilder;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerDescription;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationUtils;
|
||||
import org.apache.hadoop.hbase.replication.SyncReplicationState;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKClusterId;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKConfig;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
|
||||
|
@ -105,9 +111,20 @@ public class ReplicationPeerManager implements ConfigurationObserver {
|
|||
|
||||
private final ZKWatcher zk;
|
||||
|
||||
@FunctionalInterface
|
||||
private interface ReplicationQueueStorageInitializer {
|
||||
|
||||
void initialize() throws IOException;
|
||||
}
|
||||
|
||||
private final ReplicationQueueStorageInitializer queueStorageInitializer;
|
||||
|
||||
// we will mock this class in UT so leave the constructor as package private and not mark the
|
||||
// class as final, since mockito can not mock a final class
|
||||
ReplicationPeerManager(FileSystem fs, ZKWatcher zk, ReplicationPeerStorage peerStorage,
|
||||
ReplicationQueueStorage queueStorage, ConcurrentMap<String, ReplicationPeerDescription> peers,
|
||||
Configuration conf, String clusterId) {
|
||||
Configuration conf, String clusterId,
|
||||
ReplicationQueueStorageInitializer queueStorageInitializer) {
|
||||
this.fs = fs;
|
||||
this.zk = zk;
|
||||
this.peerStorage = peerStorage;
|
||||
|
@ -115,19 +132,15 @@ public class ReplicationPeerManager implements ConfigurationObserver {
|
|||
this.peers = peers;
|
||||
this.conf = conf;
|
||||
this.clusterId = clusterId;
|
||||
this.queueStorageInitializer = queueStorageInitializer;
|
||||
}
|
||||
|
||||
private void checkQueuesDeleted(String peerId)
|
||||
throws ReplicationException, DoNotRetryIOException {
|
||||
for (ServerName replicator : queueStorage.getListOfReplicators()) {
|
||||
List<String> queueIds = queueStorage.getAllQueues(replicator);
|
||||
for (String queueId : queueIds) {
|
||||
ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
|
||||
if (queueInfo.getPeerId().equals(peerId)) {
|
||||
throw new DoNotRetryIOException("undeleted queue for peerId: " + peerId + ", replicator: "
|
||||
+ replicator + ", queueId: " + queueId);
|
||||
}
|
||||
}
|
||||
List<ReplicationQueueId> queueIds = queueStorage.listAllQueueIds(peerId);
|
||||
if (!queueIds.isEmpty()) {
|
||||
throw new DoNotRetryIOException("There are still " + queueIds.size()
|
||||
+ " undeleted queue(s) for peerId: " + peerId + ", first is " + queueIds.get(0));
|
||||
}
|
||||
if (queueStorage.getAllPeersFromHFileRefsQueue().contains(peerId)) {
|
||||
throw new DoNotRetryIOException("Undeleted queue for peer " + peerId + " in hfile-refs");
|
||||
|
@ -135,7 +148,7 @@ public class ReplicationPeerManager implements ConfigurationObserver {
|
|||
}
|
||||
|
||||
void preAddPeer(String peerId, ReplicationPeerConfig peerConfig)
|
||||
throws DoNotRetryIOException, ReplicationException {
|
||||
throws ReplicationException, IOException {
|
||||
if (peerId.contains("-")) {
|
||||
throw new DoNotRetryIOException("Found invalid peer name: " + peerId);
|
||||
}
|
||||
|
@ -146,6 +159,9 @@ public class ReplicationPeerManager implements ConfigurationObserver {
|
|||
if (peers.containsKey(peerId)) {
|
||||
throw new DoNotRetryIOException("Replication peer " + peerId + " already exists");
|
||||
}
|
||||
|
||||
// lazy create table
|
||||
queueStorageInitializer.initialize();
|
||||
// make sure that there is no queues with the same peer id. This may happen when we create a
|
||||
// peer with the same id with a old deleted peer. If the replication queues for the old peer
|
||||
// have not been cleaned up yet then we should not create the new peer, otherwise the old wal
|
||||
|
@ -365,8 +381,8 @@ public class ReplicationPeerManager implements ConfigurationObserver {
|
|||
// claimed once after the refresh peer procedure done(as the next claim queue will just delete
|
||||
// it), so we can make sure that a two pass scan will finally find the queue and remove it,
|
||||
// unless it has already been removed by others.
|
||||
ReplicationUtils.removeAllQueues(queueStorage, peerId);
|
||||
ReplicationUtils.removeAllQueues(queueStorage, peerId);
|
||||
queueStorage.removeAllQueues(peerId);
|
||||
queueStorage.removeAllQueues(peerId);
|
||||
}
|
||||
|
||||
public void removeAllQueuesAndHFileRefs(String peerId) throws ReplicationException {
|
||||
|
@ -568,14 +584,69 @@ public class ReplicationPeerManager implements ConfigurationObserver {
|
|||
.collect(Collectors.toList());
|
||||
}
|
||||
|
||||
@RestrictedApi(explanation = "Should only be called in tests", link = "",
|
||||
allowedOnPath = ".*/src/test/.*")
|
||||
public ReplicationPeerStorage getPeerStorage() {
|
||||
return peerStorage;
|
||||
}
|
||||
|
||||
public ReplicationQueueStorage getQueueStorage() {
|
||||
return queueStorage;
|
||||
}
|
||||
|
||||
public static ReplicationPeerManager create(FileSystem fs, ZKWatcher zk, Configuration conf,
|
||||
String clusterId) throws ReplicationException {
|
||||
private static Pair<ReplicationQueueStorage, ReplicationQueueStorageInitializer>
|
||||
createReplicationQueueStorage(MasterServices services) throws IOException {
|
||||
Configuration conf = services.getConfiguration();
|
||||
TableName replicationQueueTableName =
|
||||
TableName.valueOf(conf.get(ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME,
|
||||
ReplicationStorageFactory.REPLICATION_QUEUE_TABLE_NAME_DEFAULT.getNameAsString()));
|
||||
ReplicationQueueStorageInitializer initializer;
|
||||
if (services.getTableDescriptors().exists(replicationQueueTableName)) {
|
||||
// no need to create the table
|
||||
initializer = () -> {
|
||||
};
|
||||
} else {
|
||||
// lazy create the replication table.
|
||||
initializer = new ReplicationQueueStorageInitializer() {
|
||||
|
||||
private volatile boolean created = false;
|
||||
|
||||
@Override
|
||||
public void initialize() throws IOException {
|
||||
if (created) {
|
||||
return;
|
||||
}
|
||||
synchronized (this) {
|
||||
if (created) {
|
||||
return;
|
||||
}
|
||||
if (services.getTableDescriptors().exists(replicationQueueTableName)) {
|
||||
created = true;
|
||||
return;
|
||||
}
|
||||
long procId = services.createSystemTable(ReplicationStorageFactory
|
||||
.createReplicationQueueTableDescriptor(replicationQueueTableName));
|
||||
ProcedureExecutor<MasterProcedureEnv> procExec = services.getMasterProcedureExecutor();
|
||||
ProcedureSyncWait.waitFor(procExec.getEnvironment(), TimeUnit.MINUTES.toMillis(1),
|
||||
"Creating table " + replicationQueueTableName, () -> procExec.isFinished(procId));
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
return Pair.newPair(ReplicationStorageFactory.getReplicationQueueStorage(
|
||||
services.getConnection(), replicationQueueTableName), initializer);
|
||||
}
|
||||
|
||||
public static ReplicationPeerManager create(MasterServices services, String clusterId)
|
||||
throws ReplicationException, IOException {
|
||||
Configuration conf = services.getConfiguration();
|
||||
FileSystem fs = services.getMasterFileSystem().getFileSystem();
|
||||
ZKWatcher zk = services.getZooKeeper();
|
||||
ReplicationPeerStorage peerStorage =
|
||||
ReplicationStorageFactory.getReplicationPeerStorage(fs, zk, conf);
|
||||
Pair<ReplicationQueueStorage, ReplicationQueueStorageInitializer> pair =
|
||||
createReplicationQueueStorage(services);
|
||||
ReplicationQueueStorage queueStorage = pair.getFirst();
|
||||
ConcurrentMap<String, ReplicationPeerDescription> peers = new ConcurrentHashMap<>();
|
||||
for (String peerId : peerStorage.listPeerIds()) {
|
||||
ReplicationPeerConfig peerConfig = peerStorage.getPeerConfig(peerId);
|
||||
|
@ -585,7 +656,24 @@ public class ReplicationPeerManager implements ConfigurationObserver {
|
|||
) {
|
||||
// we do not use this endpoint for region replication any more, see HBASE-26233
|
||||
LOG.info("Legacy region replication peer found, removing: {}", peerConfig);
|
||||
peerStorage.removePeer(peerId);
|
||||
// do it asynchronous to not block the start up of HMaster
|
||||
new Thread("Remove legacy replication peer " + peerId) {
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
// need to delete two times to make sure we delete all the queues, see the comments in
|
||||
// above
|
||||
// removeAllQueues method for more details.
|
||||
queueStorage.removeAllQueues(peerId);
|
||||
queueStorage.removeAllQueues(peerId);
|
||||
// delete queue first and then peer, because we use peer as a flag.
|
||||
peerStorage.removePeer(peerId);
|
||||
} catch (Exception e) {
|
||||
LOG.warn("Failed to delete legacy replication peer {}", peerId);
|
||||
}
|
||||
}
|
||||
}.start();
|
||||
continue;
|
||||
}
|
||||
peerConfig = ReplicationPeerConfigUtil.updateReplicationBasePeerConfigs(conf, peerConfig);
|
||||
|
@ -594,8 +682,8 @@ public class ReplicationPeerManager implements ConfigurationObserver {
|
|||
SyncReplicationState state = peerStorage.getPeerSyncReplicationState(peerId);
|
||||
peers.put(peerId, new ReplicationPeerDescription(peerId, enabled, peerConfig, state));
|
||||
}
|
||||
return new ReplicationPeerManager(fs, zk, peerStorage,
|
||||
ReplicationStorageFactory.getReplicationQueueStorage(zk, conf), peers, conf, clusterId);
|
||||
return new ReplicationPeerManager(fs, zk, peerStorage, queueStorage, peers, conf, clusterId,
|
||||
pair.getSecond());
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
@ -32,6 +32,7 @@ import java.io.IOException;
|
|||
import java.io.InterruptedIOException;
|
||||
import java.lang.management.MemoryType;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Comparator;
|
||||
|
@ -441,8 +442,9 @@ public abstract class AbstractFSWAL<W extends WriterBase> implements WAL {
|
|||
}
|
||||
|
||||
// If prefix is null||empty then just name it wal
|
||||
this.walFilePrefix =
|
||||
prefix == null || prefix.isEmpty() ? "wal" : URLEncoder.encode(prefix, "UTF8");
|
||||
this.walFilePrefix = prefix == null || prefix.isEmpty()
|
||||
? "wal"
|
||||
: URLEncoder.encode(prefix, StandardCharsets.UTF_8.name());
|
||||
// we only correctly differentiate suffices when numeric ones start with '.'
|
||||
if (suffix != null && !(suffix.isEmpty()) && !(suffix.startsWith(WAL_FILE_NAME_DELIMITER))) {
|
||||
throw new IllegalArgumentException("WAL suffix must start with '" + WAL_FILE_NAME_DELIMITER
|
||||
|
|
|
@ -19,23 +19,28 @@ package org.apache.hadoop.hbase.replication.master;
|
|||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.commons.io.IOUtils;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.Server;
|
||||
import org.apache.hadoop.hbase.client.Connection;
|
||||
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.master.cleaner.BaseHFileCleanerDelegate;
|
||||
import org.apache.hadoop.hbase.master.cleaner.HFileCleaner;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.base.Predicate;
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
|
||||
import org.apache.hbase.thirdparty.org.apache.commons.collections4.MapUtils;
|
||||
|
||||
/**
|
||||
* Implementation of a file cleaner that checks if a hfile is still scheduled for replication before
|
||||
|
@ -44,15 +49,20 @@ import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
|
|||
@InterfaceAudience.LimitedPrivate(HBaseInterfaceAudience.CONFIG)
|
||||
public class ReplicationHFileCleaner extends BaseHFileCleanerDelegate {
|
||||
private static final Logger LOG = LoggerFactory.getLogger(ReplicationHFileCleaner.class);
|
||||
private ZKWatcher zkw;
|
||||
private Connection conn;
|
||||
private boolean shareConn;
|
||||
private ReplicationQueueStorage rqs;
|
||||
private boolean stopped = false;
|
||||
|
||||
@Override
|
||||
public Iterable<FileStatus> getDeletableFiles(Iterable<FileStatus> files) {
|
||||
// all members of this class are null if replication is disabled,
|
||||
// so we cannot filter the files
|
||||
if (this.getConf() == null) {
|
||||
if (
|
||||
!(getConf().getBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY,
|
||||
HConstants.REPLICATION_BULKLOAD_ENABLE_DEFAULT))
|
||||
) {
|
||||
LOG.warn(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY + " is not enabled. Better to remove "
|
||||
+ ReplicationHFileCleaner.class + " from " + HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS
|
||||
+ " configuration.");
|
||||
return files;
|
||||
}
|
||||
|
||||
|
@ -88,51 +98,34 @@ public class ReplicationHFileCleaner extends BaseHFileCleanerDelegate {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void setConf(Configuration config) {
|
||||
// If either replication or replication of bulk load hfiles is disabled, keep all members null
|
||||
if (
|
||||
!(config.getBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY,
|
||||
HConstants.REPLICATION_BULKLOAD_ENABLE_DEFAULT))
|
||||
) {
|
||||
LOG.warn(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY + " is not enabled. Better to remove "
|
||||
+ ReplicationHFileCleaner.class + " from " + HFileCleaner.MASTER_HFILE_CLEANER_PLUGINS
|
||||
+ " configuration.");
|
||||
return;
|
||||
}
|
||||
// Make my own Configuration. Then I'll have my own connection to zk that
|
||||
// I can close myself when time comes.
|
||||
Configuration conf = new Configuration(config);
|
||||
public void init(Map<String, Object> params) {
|
||||
super.init(params);
|
||||
try {
|
||||
setConf(conf, new ZKWatcher(conf, "replicationHFileCleaner", null));
|
||||
if (MapUtils.isNotEmpty(params)) {
|
||||
Object master = params.get(HMaster.MASTER);
|
||||
if (master != null && master instanceof Server) {
|
||||
conn = ((Server) master).getConnection();
|
||||
shareConn = true;
|
||||
}
|
||||
}
|
||||
if (conn == null) {
|
||||
conn = ConnectionFactory.createConnection(getConf());
|
||||
}
|
||||
this.rqs = ReplicationStorageFactory.getReplicationQueueStorage(conn, getConf());
|
||||
} catch (IOException e) {
|
||||
LOG.error("Error while configuring " + this.getClass().getName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
@InterfaceAudience.Private
|
||||
public void setConf(Configuration conf, ZKWatcher zk) {
|
||||
super.setConf(conf);
|
||||
try {
|
||||
initReplicationQueueStorage(conf, zk);
|
||||
} catch (Exception e) {
|
||||
LOG.error("Error while configuring " + this.getClass().getName(), e);
|
||||
}
|
||||
}
|
||||
|
||||
private void initReplicationQueueStorage(Configuration conf, ZKWatcher zk) {
|
||||
this.zkw = zk;
|
||||
this.rqs = ReplicationStorageFactory.getReplicationQueueStorage(zk, conf);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void stop(String why) {
|
||||
if (this.stopped) {
|
||||
return;
|
||||
}
|
||||
this.stopped = true;
|
||||
if (this.zkw != null) {
|
||||
LOG.info("Stopping " + this.zkw);
|
||||
this.zkw.close();
|
||||
if (!shareConn && this.conn != null) {
|
||||
LOG.info("Stopping " + this.conn);
|
||||
IOUtils.closeQuietly(conn);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -26,9 +26,7 @@ import org.apache.hadoop.fs.FileStatus;
|
|||
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.master.cleaner.BaseLogCleanerDelegate;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
@ -56,14 +54,15 @@ public class ReplicationLogCleaner extends BaseLogCleanerDelegate {
|
|||
@Override
|
||||
public void preClean() {
|
||||
readZKTimestamp = EnvironmentEdgeManager.currentTime();
|
||||
try {
|
||||
// The concurrently created new WALs may not be included in the return list,
|
||||
// but they won't be deleted because they're not in the checking set.
|
||||
wals = queueStorage.getAllWALs();
|
||||
} catch (ReplicationException e) {
|
||||
LOG.warn("Failed to read zookeeper, skipping checking deletable files");
|
||||
wals = null;
|
||||
}
|
||||
// TODO: revisit the implementation
|
||||
// try {
|
||||
// // The concurrently created new WALs may not be included in the return list,
|
||||
// // but they won't be deleted because they're not in the checking set.
|
||||
// wals = queueStorage.getAllWALs();
|
||||
// } catch (ReplicationException e) {
|
||||
// LOG.warn("Failed to read zookeeper, skipping checking deletable files");
|
||||
// wals = null;
|
||||
// }
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -115,7 +114,8 @@ public class ReplicationLogCleaner extends BaseLogCleanerDelegate {
|
|||
if (zkw == null) {
|
||||
zkw = new ZKWatcher(getConf(), "replicationLogCleaner", null);
|
||||
}
|
||||
this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf());
|
||||
// TODO: revisit the implementation
|
||||
// this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf());
|
||||
} catch (IOException e) {
|
||||
LOG.error("Error while configuring " + this.getClass().getName(), e);
|
||||
}
|
||||
|
@ -126,7 +126,8 @@ public class ReplicationLogCleaner extends BaseLogCleanerDelegate {
|
|||
super.setConf(conf);
|
||||
try {
|
||||
this.zkw = zk;
|
||||
this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zk, conf);
|
||||
// TODO: revisit the implementation
|
||||
// this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zk, conf);
|
||||
} catch (Exception e) {
|
||||
LOG.error("Error while configuring " + this.getClass().getName(), e);
|
||||
}
|
||||
|
|
|
@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.replication.regionserver;
|
|||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.executor.EventType;
|
||||
import org.apache.hadoop.hbase.procedure2.BaseRSProcedureCallable;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.protobuf.InvalidProtocolBufferException;
|
||||
|
@ -30,9 +31,7 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProcedureProtos.C
|
|||
@InterfaceAudience.Private
|
||||
public class ClaimReplicationQueueCallable extends BaseRSProcedureCallable {
|
||||
|
||||
private ServerName crashedServer;
|
||||
|
||||
private String queue;
|
||||
private ReplicationQueueId queueId;
|
||||
|
||||
@Override
|
||||
public EventType getEventType() {
|
||||
|
@ -42,14 +41,20 @@ public class ClaimReplicationQueueCallable extends BaseRSProcedureCallable {
|
|||
@Override
|
||||
protected void doCall() throws Exception {
|
||||
PeerProcedureHandler handler = rs.getReplicationSourceService().getPeerProcedureHandler();
|
||||
handler.claimReplicationQueue(crashedServer, queue);
|
||||
handler.claimReplicationQueue(queueId);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void initParameter(byte[] parameter) throws InvalidProtocolBufferException {
|
||||
ClaimReplicationQueueRemoteParameter param =
|
||||
ClaimReplicationQueueRemoteParameter.parseFrom(parameter);
|
||||
crashedServer = ProtobufUtil.toServerName(param.getCrashedServer());
|
||||
queue = param.getQueue();
|
||||
ServerName crashedServer = ProtobufUtil.toServerName(param.getCrashedServer());
|
||||
String queue = param.getQueue();
|
||||
if (param.hasSourceServer()) {
|
||||
ServerName sourceServer = ProtobufUtil.toServerName(param.getSourceServer());
|
||||
queueId = new ReplicationQueueId(crashedServer, queue, sourceServer);
|
||||
} else {
|
||||
queueId = new ReplicationQueueId(crashedServer, queue);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -21,7 +21,6 @@ import java.io.FileNotFoundException;
|
|||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.Collections;
|
||||
import java.util.LinkedList;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
|
@ -45,10 +44,8 @@ import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
|
|||
import org.apache.hadoop.hbase.replication.ReplicationPeerDescription;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKDump;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKUtil;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
|
||||
import org.apache.hadoop.util.Tool;
|
||||
import org.apache.hadoop.util.ToolRunner;
|
||||
|
@ -59,6 +56,8 @@ import org.slf4j.LoggerFactory;
|
|||
import org.apache.hbase.thirdparty.com.google.common.util.concurrent.AtomicLongMap;
|
||||
|
||||
/**
|
||||
* TODO: reimplement this tool
|
||||
* <p/>
|
||||
* Provides information about the existing states of replication, replication peers and queues.
|
||||
* Usage: hbase org.apache.hadoop.hbase.replication.regionserver.DumpReplicationQueues [args]
|
||||
* Arguments: --distributed Polls each RS to dump information about the queue --hdfs Reports HDFS
|
||||
|
@ -299,32 +298,33 @@ public class DumpReplicationQueues extends Configured implements Tool {
|
|||
ReplicationQueueStorage queueStorage;
|
||||
StringBuilder sb = new StringBuilder();
|
||||
|
||||
queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf());
|
||||
Set<ServerName> liveRegionServers = ZKUtil.listChildrenNoWatch(zkw, zkw.getZNodePaths().rsZNode)
|
||||
.stream().map(ServerName::parseServerName).collect(Collectors.toSet());
|
||||
|
||||
// queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf());
|
||||
// Set<ServerName> liveRegionServers = ZKUtil.listChildrenNoWatch(zkw,
|
||||
// zkw.getZNodePaths().rsZNode)
|
||||
// .stream().map(ServerName::parseServerName).collect(Collectors.toSet());
|
||||
//
|
||||
// Loops each peer on each RS and dumps the queues
|
||||
List<ServerName> regionservers = queueStorage.getListOfReplicators();
|
||||
if (regionservers == null || regionservers.isEmpty()) {
|
||||
return sb.toString();
|
||||
}
|
||||
for (ServerName regionserver : regionservers) {
|
||||
List<String> queueIds = queueStorage.getAllQueues(regionserver);
|
||||
if (!liveRegionServers.contains(regionserver)) {
|
||||
deadRegionServers.add(regionserver.getServerName());
|
||||
}
|
||||
for (String queueId : queueIds) {
|
||||
ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
|
||||
List<String> wals = queueStorage.getWALsInQueue(regionserver, queueId);
|
||||
Collections.sort(wals);
|
||||
if (!peerIds.contains(queueInfo.getPeerId())) {
|
||||
deletedQueues.add(regionserver + "/" + queueId);
|
||||
sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, true, hdfs));
|
||||
} else {
|
||||
sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, false, hdfs));
|
||||
}
|
||||
}
|
||||
}
|
||||
// List<ServerName> regionservers = queueStorage.getListOfReplicators();
|
||||
// if (regionservers == null || regionservers.isEmpty()) {
|
||||
// return sb.toString();
|
||||
// }
|
||||
// for (ServerName regionserver : regionservers) {
|
||||
// List<String> queueIds = queueStorage.getAllQueues(regionserver);
|
||||
// if (!liveRegionServers.contains(regionserver)) {
|
||||
// deadRegionServers.add(regionserver.getServerName());
|
||||
// }
|
||||
// for (String queueId : queueIds) {
|
||||
// ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
|
||||
// List<String> wals = queueStorage.getWALsInQueue(regionserver, queueId);
|
||||
// Collections.sort(wals);
|
||||
// if (!peerIds.contains(queueInfo.getPeerId())) {
|
||||
// deletedQueues.add(regionserver + "/" + queueId);
|
||||
// sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, true, hdfs));
|
||||
// } else {
|
||||
// sb.append(formatQueue(regionserver, queueStorage, queueInfo, queueId, wals, false, hdfs));
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
return sb.toString();
|
||||
}
|
||||
|
||||
|
@ -350,9 +350,9 @@ public class DumpReplicationQueues extends Configured implements Tool {
|
|||
peersQueueSize.addAndGet(queueInfo.getPeerId(), wals.size());
|
||||
|
||||
for (String wal : wals) {
|
||||
long position = queueStorage.getWALPosition(regionserver, queueInfo.getPeerId(), wal);
|
||||
sb.append(" Replication position for " + wal + ": "
|
||||
+ (position > 0 ? position : "0" + " (not started or nothing to replicate)") + "\n");
|
||||
// long position = queueStorage.getWALPosition(regionserver, queueInfo.getPeerId(), wal);
|
||||
// sb.append(" Replication position for " + wal + ": "
|
||||
// + (position > 0 ? position : "0" + " (not started or nothing to replicate)") + "\n");
|
||||
}
|
||||
|
||||
if (hdfs) {
|
||||
|
|
|
@ -18,9 +18,9 @@
|
|||
package org.apache.hadoop.hbase.replication.regionserver;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
|
||||
/**
|
||||
|
@ -42,6 +42,5 @@ public interface PeerProcedureHandler {
|
|||
void transitSyncReplicationPeerState(String peerId, int stage, HRegionServer rs)
|
||||
throws ReplicationException, IOException;
|
||||
|
||||
void claimReplicationQueue(ServerName crashedServer, String queue)
|
||||
throws ReplicationException, IOException;
|
||||
void claimReplicationQueue(ReplicationQueueId queueId) throws ReplicationException, IOException;
|
||||
}
|
||||
|
|
|
@ -20,7 +20,6 @@ package org.apache.hadoop.hbase.replication.regionserver;
|
|||
import java.io.IOException;
|
||||
import java.io.InterruptedIOException;
|
||||
import java.util.concurrent.locks.Lock;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
import org.apache.hadoop.hbase.regionserver.LogRoller;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
|
@ -28,6 +27,7 @@ import org.apache.hadoop.hbase.replication.ReplicationPeer.PeerState;
|
|||
import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerImpl;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeers;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationUtils;
|
||||
import org.apache.hadoop.hbase.replication.SyncReplicationState;
|
||||
import org.apache.hadoop.hbase.util.KeyLocker;
|
||||
|
@ -226,8 +226,8 @@ public class PeerProcedureHandlerImpl implements PeerProcedureHandler {
|
|||
}
|
||||
|
||||
@Override
|
||||
public void claimReplicationQueue(ServerName crashedServer, String queue)
|
||||
public void claimReplicationQueue(ReplicationQueueId queueId)
|
||||
throws ReplicationException, IOException {
|
||||
replicationSourceManager.claimQueue(crashedServer, queue);
|
||||
replicationSourceManager.claimQueue(queueId);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,23 +17,7 @@
|
|||
*/
|
||||
package org.apache.hadoop.hbase.replication.regionserver;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.List;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.PriorityBlockingQueue;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.Server;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeer;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.util.CommonFSUtils;
|
||||
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Class that handles the recovered source of a replication stream, which is transfered from another
|
||||
|
@ -42,124 +26,14 @@ import org.slf4j.LoggerFactory;
|
|||
@InterfaceAudience.Private
|
||||
public class RecoveredReplicationSource extends ReplicationSource {
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(RecoveredReplicationSource.class);
|
||||
|
||||
private String actualPeerId;
|
||||
|
||||
@Override
|
||||
public void init(Configuration conf, FileSystem fs, ReplicationSourceManager manager,
|
||||
ReplicationQueueStorage queueStorage, ReplicationPeer replicationPeer, Server server,
|
||||
String peerClusterZnode, UUID clusterId, WALFileLengthProvider walFileLengthProvider,
|
||||
MetricsSource metrics) throws IOException {
|
||||
super.init(conf, fs, manager, queueStorage, replicationPeer, server, peerClusterZnode,
|
||||
clusterId, walFileLengthProvider, metrics);
|
||||
this.actualPeerId = this.replicationQueueInfo.getPeerId();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected RecoveredReplicationSourceShipper createNewShipper(String walGroupId) {
|
||||
return new RecoveredReplicationSourceShipper(conf, walGroupId, logQueue, this, queueStorage);
|
||||
}
|
||||
|
||||
public void locateRecoveredPaths(String walGroupId) throws IOException {
|
||||
boolean hasPathChanged = false;
|
||||
PriorityBlockingQueue<Path> queue = logQueue.getQueue(walGroupId);
|
||||
PriorityBlockingQueue<Path> newPaths = new PriorityBlockingQueue<Path>(queueSizePerGroup,
|
||||
new AbstractFSWALProvider.WALStartTimeComparator());
|
||||
pathsLoop: for (Path path : queue) {
|
||||
if (fs.exists(path)) { // still in same location, don't need to do anything
|
||||
newPaths.add(path);
|
||||
continue;
|
||||
}
|
||||
// Path changed - try to find the right path.
|
||||
hasPathChanged = true;
|
||||
if (server instanceof ReplicationSyncUp.DummyServer) {
|
||||
// In the case of disaster/recovery, HMaster may be shutdown/crashed before flush data
|
||||
// from .logs to .oldlogs. Loop into .logs folders and check whether a match exists
|
||||
Path newPath = getReplSyncUpPath(path);
|
||||
newPaths.add(newPath);
|
||||
continue;
|
||||
} else {
|
||||
// See if Path exists in the dead RS folder (there could be a chain of failures
|
||||
// to look at)
|
||||
List<ServerName> deadRegionServers = this.replicationQueueInfo.getDeadRegionServers();
|
||||
LOG.info("NB dead servers : " + deadRegionServers.size());
|
||||
final Path walDir = CommonFSUtils.getWALRootDir(conf);
|
||||
for (ServerName curDeadServerName : deadRegionServers) {
|
||||
final Path deadRsDirectory = new Path(walDir,
|
||||
AbstractFSWALProvider.getWALDirectoryName(curDeadServerName.getServerName()));
|
||||
Path[] locs = new Path[] { new Path(deadRsDirectory, path.getName()),
|
||||
new Path(deadRsDirectory.suffix(AbstractFSWALProvider.SPLITTING_EXT), path.getName()) };
|
||||
for (Path possibleLogLocation : locs) {
|
||||
LOG.info("Possible location " + possibleLogLocation.toUri().toString());
|
||||
if (manager.getFs().exists(possibleLogLocation)) {
|
||||
// We found the right new location
|
||||
LOG.info("Log " + path + " still exists at " + possibleLogLocation);
|
||||
newPaths.add(possibleLogLocation);
|
||||
continue pathsLoop;
|
||||
}
|
||||
}
|
||||
return new RecoveredReplicationSourceShipper(conf, walGroupId, logQueue, this, queueStorage,
|
||||
() -> {
|
||||
if (workerThreads.isEmpty()) {
|
||||
this.getSourceMetrics().clear();
|
||||
manager.finishRecoveredSource(this);
|
||||
}
|
||||
// didn't find a new location
|
||||
LOG.error(
|
||||
String.format("WAL Path %s doesn't exist and couldn't find its new location", path));
|
||||
newPaths.add(path);
|
||||
}
|
||||
}
|
||||
|
||||
if (hasPathChanged) {
|
||||
if (newPaths.size() != queue.size()) { // this shouldn't happen
|
||||
LOG.error("Recovery queue size is incorrect");
|
||||
throw new IOException("Recovery queue size error");
|
||||
}
|
||||
// put the correct locations in the queue
|
||||
// since this is a recovered queue with no new incoming logs,
|
||||
// there shouldn't be any concurrency issues
|
||||
logQueue.clear(walGroupId);
|
||||
for (Path path : newPaths) {
|
||||
logQueue.enqueueLog(path, walGroupId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// N.B. the ReplicationSyncUp tool sets the manager.getWALDir to the root of the wal
|
||||
// area rather than to the wal area for a particular region server.
|
||||
private Path getReplSyncUpPath(Path path) throws IOException {
|
||||
FileStatus[] rss = fs.listStatus(manager.getLogDir());
|
||||
for (FileStatus rs : rss) {
|
||||
Path p = rs.getPath();
|
||||
FileStatus[] logs = fs.listStatus(p);
|
||||
for (FileStatus log : logs) {
|
||||
p = new Path(p, log.getPath().getName());
|
||||
if (p.getName().equals(path.getName())) {
|
||||
LOG.info("Log " + p.getName() + " found at " + p);
|
||||
return p;
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG.error("Didn't find path for: " + path.getName());
|
||||
return path;
|
||||
}
|
||||
|
||||
void tryFinish() {
|
||||
if (workerThreads.isEmpty()) {
|
||||
this.getSourceMetrics().clear();
|
||||
manager.finishRecoveredSource(this);
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPeerId() {
|
||||
return this.actualPeerId;
|
||||
}
|
||||
|
||||
@Override
|
||||
public ServerName getServerWALsBelongTo() {
|
||||
return this.replicationQueueInfo.getDeadRegionServers().get(0);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean isRecovered() {
|
||||
return true;
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
|
@ -17,83 +17,27 @@
|
|||
*/
|
||||
package org.apache.hadoop.hbase.replication.regionserver;
|
||||
|
||||
import java.io.IOException;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
/**
|
||||
* Used by a {@link RecoveredReplicationSource}.
|
||||
*/
|
||||
@InterfaceAudience.Private
|
||||
public class RecoveredReplicationSourceShipper extends ReplicationSourceShipper {
|
||||
private static final Logger LOG =
|
||||
LoggerFactory.getLogger(RecoveredReplicationSourceShipper.class);
|
||||
|
||||
protected final RecoveredReplicationSource source;
|
||||
private final ReplicationQueueStorage replicationQueues;
|
||||
private final Runnable tryFinish;
|
||||
|
||||
public RecoveredReplicationSourceShipper(Configuration conf, String walGroupId,
|
||||
ReplicationSourceLogQueue logQueue, RecoveredReplicationSource source,
|
||||
ReplicationQueueStorage queueStorage) {
|
||||
ReplicationQueueStorage queueStorage, Runnable tryFinish) {
|
||||
super(conf, walGroupId, logQueue, source);
|
||||
this.source = source;
|
||||
this.replicationQueues = queueStorage;
|
||||
this.tryFinish = tryFinish;
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void postFinish() {
|
||||
source.tryFinish();
|
||||
}
|
||||
|
||||
@Override
|
||||
public long getStartPosition() {
|
||||
long startPosition = getRecoveredQueueStartPos();
|
||||
int numRetries = 0;
|
||||
while (numRetries <= maxRetriesMultiplier) {
|
||||
try {
|
||||
source.locateRecoveredPaths(walGroupId);
|
||||
break;
|
||||
} catch (IOException e) {
|
||||
LOG.error("Error while locating recovered queue paths, attempt #" + numRetries, e);
|
||||
numRetries++;
|
||||
}
|
||||
}
|
||||
return startPosition;
|
||||
}
|
||||
|
||||
// If this is a recovered queue, the queue is already full and the first log
|
||||
// normally has a position (unless the RS failed between 2 logs)
|
||||
private long getRecoveredQueueStartPos() {
|
||||
long startPosition = 0;
|
||||
String peerClusterZNode = source.getQueueId();
|
||||
try {
|
||||
startPosition = this.replicationQueues.getWALPosition(source.getServer().getServerName(),
|
||||
peerClusterZNode, this.logQueue.getQueue(walGroupId).peek().getName());
|
||||
LOG.trace("Recovered queue started with log {} at position {}",
|
||||
this.logQueue.getQueue(walGroupId).peek(), startPosition);
|
||||
} catch (ReplicationException e) {
|
||||
terminate("Couldn't get the position of this recovered queue " + peerClusterZNode, e);
|
||||
}
|
||||
return startPosition;
|
||||
}
|
||||
|
||||
private void terminate(String reason, Exception cause) {
|
||||
if (cause == null) {
|
||||
LOG.info("Closing worker for wal group {} because: {}", this.walGroupId, reason);
|
||||
} else {
|
||||
LOG.error(
|
||||
"Closing worker for wal group " + this.walGroupId + " because an error occurred: " + reason,
|
||||
cause);
|
||||
}
|
||||
entryReader.interrupt();
|
||||
Threads.shutdown(entryReader, sleepForRetries);
|
||||
this.interrupt();
|
||||
Threads.shutdown(this, sleepForRetries);
|
||||
LOG.info("ReplicationSourceWorker {} terminated", this.getName());
|
||||
tryFinish.run();
|
||||
}
|
||||
}
|
||||
|
|
|
@ -100,7 +100,7 @@ public class Replication implements ReplicationSourceService, PropagatingConfigu
|
|||
|
||||
try {
|
||||
this.queueStorage =
|
||||
ReplicationStorageFactory.getReplicationQueueStorage(server.getZooKeeper(), conf);
|
||||
ReplicationStorageFactory.getReplicationQueueStorage(server.getConnection(), conf);
|
||||
this.replicationPeers = ReplicationFactory.getReplicationPeers(server.getFileSystem(),
|
||||
server.getZooKeeper(), this.conf);
|
||||
this.replicationPeers.init();
@ -89,8 +89,8 @@ public class ReplicationLoad {
|
|||
rLoadSourceBuild.setOPsShipped(oPsShipped);
|
||||
if (source instanceof ReplicationSource) {
|
||||
ReplicationSource replSource = (ReplicationSource) source;
|
||||
rLoadSourceBuild.setRecovered(replSource.getReplicationQueueInfo().isQueueRecovered());
|
||||
rLoadSourceBuild.setQueueId(replSource.getReplicationQueueInfo().getQueueId());
|
||||
rLoadSourceBuild.setRecovered(replSource.getQueueId().isRecovered());
|
||||
rLoadSourceBuild.setQueueId(replSource.getQueueId().toString());
|
||||
rLoadSourceBuild.setRunning(replSource.isWorkerRunning());
|
||||
rLoadSourceBuild.setEditsSinceRestart(timeStampOfNextToReplicate > 0);
|
||||
}
@ -19,6 +19,7 @@ package org.apache.hadoop.hbase.replication.regionserver;
|
|||
|
||||
import static org.apache.hadoop.hbase.wal.AbstractFSWALProvider.findArchivedLog;
|
||||
|
||||
import com.google.errorprone.annotations.RestrictedApi;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.lang.reflect.InvocationTargetException;
|
||||
|
@ -52,8 +53,10 @@ import org.apache.hadoop.hbase.replication.ChainWALEntryFilter;
|
|||
import org.apache.hadoop.hbase.replication.ClusterMarkingEntryFilter;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationEndpoint;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeer;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueData;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.replication.SystemTableWALEntryFilter;
|
||||
import org.apache.hadoop.hbase.replication.WALEntryFilter;
|
||||
|
@ -67,6 +70,7 @@ import org.apache.yetus.audience.InterfaceAudience;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap;
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
|
||||
|
||||
/**
|
||||
|
@ -90,7 +94,6 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
protected ReplicationPeer replicationPeer;
|
||||
|
||||
protected Configuration conf;
|
||||
protected ReplicationQueueInfo replicationQueueInfo;
|
||||
|
||||
// The manager of all sources to which we ping back our progress
|
||||
protected ReplicationSourceManager manager;
|
||||
|
@ -103,8 +106,11 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
private UUID clusterId;
|
||||
// total number of edits we replicated
|
||||
private AtomicLong totalReplicatedEdits = new AtomicLong(0);
|
||||
// The znode we currently play with
|
||||
protected String queueId;
|
||||
// The id of the replication queue
|
||||
protected ReplicationQueueId queueId;
|
||||
// The start offsets. Usually only recovered replication queue needs this, but probably when we
|
||||
// update the peer config and restart the replication peer, we also need this?
|
||||
protected ImmutableMap<String, ReplicationGroupOffset> startOffsets;
|
||||
// Maximum number of retries before taking bold actions
|
||||
private int maxRetriesMultiplier;
|
||||
// Indicates if this particular source is running
|
||||
|
@ -184,14 +190,14 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
* @param fs file system to use
|
||||
* @param manager replication manager to ping to
|
||||
* @param server the server for this region server
|
||||
* @param queueId the id of our replication queue
|
||||
* @param queueData the id and offsets of our replication queue
|
||||
* @param clusterId unique UUID for the cluster
|
||||
* @param metrics metrics for replication source
|
||||
*/
|
||||
@Override
|
||||
public void init(Configuration conf, FileSystem fs, ReplicationSourceManager manager,
|
||||
ReplicationQueueStorage queueStorage, ReplicationPeer replicationPeer, Server server,
|
||||
String queueId, UUID clusterId, WALFileLengthProvider walFileLengthProvider,
|
||||
ReplicationQueueData queueData, UUID clusterId, WALFileLengthProvider walFileLengthProvider,
|
||||
MetricsSource metrics) throws IOException {
|
||||
this.server = server;
|
||||
this.conf = HBaseConfiguration.create(conf);
|
||||
|
@ -211,8 +217,8 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
this.metrics = metrics;
|
||||
this.clusterId = clusterId;
|
||||
|
||||
this.queueId = queueId;
|
||||
this.replicationQueueInfo = new ReplicationQueueInfo(queueId);
|
||||
this.queueId = queueData.getId();
|
||||
this.startOffsets = queueData.getOffsets();
|
||||
|
||||
// A defaultBandwidth of '0' means no bandwidth; i.e. no throttling.
|
||||
defaultBandwidth = this.conf.getLong("replication.source.per.peer.node.bandwidth", 0);
|
||||
|
@ -240,24 +246,24 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
return;
|
||||
}
|
||||
// Use WAL prefix as the WALGroupId for this peer.
|
||||
String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName());
|
||||
boolean queueExists = logQueue.enqueueLog(wal, walPrefix);
|
||||
String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName());
|
||||
boolean queueExists = logQueue.enqueueLog(wal, walGroupId);
|
||||
|
||||
if (!queueExists) {
|
||||
if (this.isSourceActive() && this.walEntryFilter != null) {
|
||||
// new wal group observed after source startup, start a new worker thread to track it
|
||||
// notice: it's possible that wal enqueued when this.running is set but worker thread
|
||||
// still not launched, so it's necessary to check workerThreads before start the worker
|
||||
tryStartNewShipper(walPrefix);
|
||||
tryStartNewShipper(walGroupId);
|
||||
}
|
||||
}
|
||||
if (LOG.isTraceEnabled()) {
|
||||
LOG.trace("{} Added wal {} to queue of source {}.", logPeerId(), walPrefix,
|
||||
this.replicationQueueInfo.getQueueId());
|
||||
LOG.trace("{} Added wal {} to queue of source {}.", logPeerId(), walGroupId, queueId);
|
||||
}
|
||||
}
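  // Illustrative sketch, not part of this patch: the wal group id used above is, roughly, the wal
  // file name with its trailing ".<timestamp>" suffix stripped. This hypothetical helper only
  // shows the shape of that mapping for a plain (non-meta) wal name; the real derivation is
  // AbstractFSWALProvider.getWALPrefixFromWALName.
  private static String exampleWalGroupId(String walName) {
    // e.g. "rs1%2C16020%2C1689000000000.1689000012345" -> "rs1%2C16020%2C1689000000000"
    int lastDot = walName.lastIndexOf('.');
    return lastDot < 0 ? walName : walName.substring(0, lastDot);
  }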
|
||||
|
||||
@InterfaceAudience.Private
|
||||
@RestrictedApi(explanation = "Should only be called in tests", link = "",
|
||||
allowedOnPath = ".*/src/test/.*")
|
||||
public Map<String, PriorityBlockingQueue<Path>> getQueues() {
|
||||
return logQueue.getQueues();
|
||||
}
|
||||
|
@ -330,6 +336,30 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
this.walEntryFilter = new ChainWALEntryFilter(filters);
|
||||
}
|
||||
|
||||
private long getStartOffset(String walGroupId) {
|
||||
ReplicationGroupOffset startOffset = startOffsets.get(walGroupId);
|
||||
if (startOffset == null || startOffset == ReplicationGroupOffset.BEGIN) {
|
||||
return 0L;
|
||||
}
|
||||
// this method will only be called when start new shipper, and we will only start new shipper
|
||||
// when there is a new queue, so here the queue for walGroupId will never be null.
|
||||
Path first = logQueue.getQueue(walGroupId).peek();
|
||||
if (!startOffset.getWal().equals(first.getName())) {
|
||||
return 0L;
|
||||
}
|
||||
// Usually, if we arrive here, the start offset should never be -1, as it means this file has
|
||||
// been fully replicated so we should have filtered it out in upper layer, usually in
|
||||
// ReplicationSourceManager. Add a warn message for safety, as usually replicate more data will
|
||||
// not cause big problems.
|
||||
if (startOffset.getOffset() < 0) {
|
||||
LOG.warn("Should have already replicated wal {}, return start offset as 0",
|
||||
startOffset.getWal());
|
||||
return 0L;
|
||||
} else {
|
||||
return startOffset.getOffset();
|
||||
}
|
||||
}
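  // Illustrative sketch, not part of this patch: the same decision table as getStartOffset above,
  // written against a plain wal name so the three cases are easy to see. "firstWalInQueue" stands
  // for the name of the first wal queued for the group.
  private static long exampleStartOffset(ReplicationGroupOffset offset, String firstWalInQueue) {
    if (offset == null || offset == ReplicationGroupOffset.BEGIN) {
      return 0L; // no recorded progress, read from the beginning
    }
    if (!offset.getWal().equals(firstWalInQueue)) {
      return 0L; // progress was recorded for a different wal, the queued one starts fresh
    }
    // a negative offset marks the wal as fully replicated; restarting at 0 only repeats work
    return offset.getOffset() < 0 ? 0L : offset.getOffset();
  }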
|
||||
|
||||
private void tryStartNewShipper(String walGroupId) {
|
||||
workerThreads.compute(walGroupId, (key, value) -> {
|
||||
if (value != null) {
|
||||
|
@ -339,7 +369,7 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
LOG.debug("{} starting shipping worker for walGroupId={}", logPeerId(), walGroupId);
|
||||
ReplicationSourceShipper worker = createNewShipper(walGroupId);
|
||||
ReplicationSourceWALReader walReader =
|
||||
createNewWALReader(walGroupId, worker.getStartPosition());
|
||||
createNewWALReader(walGroupId, getStartOffset(walGroupId));
|
||||
Threads.setDaemonThreadRunning(
|
||||
walReader, Thread.currentThread().getName() + ".replicationSource.wal-reader."
|
||||
+ walGroupId + "," + queueId,
|
||||
|
@ -568,7 +598,7 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
}
|
||||
|
||||
LOG.info("{} queueId={} (queues={}) is replicating from cluster={} to cluster={}", logPeerId(),
|
||||
this.replicationQueueInfo.getQueueId(), logQueue.getNumQueues(), clusterId, peerClusterId);
|
||||
queueId, logQueue.getNumQueues(), clusterId, peerClusterId);
|
||||
initializeWALEntryFilter(peerClusterId);
|
||||
// Start workers
|
||||
for (String walGroupId : logQueue.getQueues().keySet()) {
|
||||
|
@ -715,7 +745,7 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
}
|
||||
|
||||
@Override
|
||||
public String getQueueId() {
|
||||
public ReplicationQueueId getQueueId() {
|
||||
return this.queueId;
|
||||
}
|
||||
|
||||
|
@ -735,10 +765,6 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
return !this.server.isStopped() && this.sourceRunning;
|
||||
}
|
||||
|
||||
public ReplicationQueueInfo getReplicationQueueInfo() {
|
||||
return replicationQueueInfo;
|
||||
}
|
||||
|
||||
public boolean isWorkerRunning() {
|
||||
for (ReplicationSourceShipper worker : this.workerThreads.values()) {
|
||||
if (worker.isActive()) {
|
||||
|
@ -791,7 +817,7 @@ public class ReplicationSource implements ReplicationSourceInterface {
|
|||
|
||||
@Override
|
||||
public ServerName getServerWALsBelongTo() {
|
||||
return server.getServerName();
|
||||
return queueId.getServerWALsBelongTo();
|
||||
}
|
||||
|
||||
@Override
@ -18,7 +18,7 @@
|
|||
package org.apache.hadoop.hbase.replication.regionserver;
|
||||
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.yetus.audience.InterfaceAudience;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -34,9 +34,8 @@ public final class ReplicationSourceFactory {
|
|||
private ReplicationSourceFactory() {
|
||||
}
|
||||
|
||||
static ReplicationSourceInterface create(Configuration conf, String queueId) {
|
||||
ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(queueId);
|
||||
boolean isQueueRecovered = replicationQueueInfo.isQueueRecovered();
|
||||
static ReplicationSourceInterface create(Configuration conf, ReplicationQueueId queueId) {
|
||||
boolean isQueueRecovered = queueId.isRecovered();
|
||||
ReplicationSourceInterface src;
|
||||
try {
|
||||
String defaultReplicationSourceImpl = isQueueRecovered
@ -31,6 +31,8 @@ import org.apache.hadoop.hbase.TableName;
|
|||
import org.apache.hadoop.hbase.replication.ReplicationEndpoint;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeer;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueData;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.wal.WAL.Entry;
|
||||
|
@ -43,14 +45,22 @@ import org.apache.yetus.audience.InterfaceAudience;
|
|||
public interface ReplicationSourceInterface {
|
||||
/**
|
||||
* Initializer for the source
|
||||
* @param conf the configuration to use
|
||||
* @param fs the file system to use
|
||||
* @param manager the manager to use
|
||||
* @param server the server for this region server
|
||||
* @param conf the configuration to use
|
||||
* @param fs the file system to use
|
||||
* @param manager the manager to use
|
||||
* @param queueStorage the replication queue storage
|
||||
* @param replicationPeer the replication peer
|
||||
* @param server the server for this region server
|
||||
* @param queueData the existing replication queue data, contains the queue id and
|
||||
* replication start offsets
|
||||
* @param clusterId the cluster id
|
||||
* @param walFileLengthProvider for getting the length of the WAL file which is currently being
|
||||
* written
|
||||
* @param metrics the replication metrics
|
||||
*/
|
||||
void init(Configuration conf, FileSystem fs, ReplicationSourceManager manager,
|
||||
ReplicationQueueStorage queueStorage, ReplicationPeer replicationPeer, Server server,
|
||||
String queueId, UUID clusterId, WALFileLengthProvider walFileLengthProvider,
|
||||
ReplicationQueueData queueData, UUID clusterId, WALFileLengthProvider walFileLengthProvider,
|
||||
MetricsSource metrics) throws IOException;
|
||||
|
||||
/**
|
||||
|
@ -106,14 +116,14 @@ public interface ReplicationSourceInterface {
|
|||
* Get the queue id that the source is replicating to
|
||||
* @return queue id
|
||||
*/
|
||||
String getQueueId();
|
||||
ReplicationQueueId getQueueId();
|
||||
|
||||
/**
|
||||
* Get the id that the source is replicating to.
|
||||
* @return peer id
|
||||
*/
|
||||
default String getPeerId() {
|
||||
return getPeer().getId();
|
||||
return getQueueId().getPeerId();
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -183,7 +193,7 @@ public interface ReplicationSourceInterface {
|
|||
|
||||
/** Returns whether this is a replication source for recovery. */
|
||||
default boolean isRecovered() {
|
||||
return false;
|
||||
return getQueueId().isRecovered();
|
||||
}
|
||||
|
||||
/** Returns The instance of queueStorage used by this ReplicationSource. */
@ -97,7 +97,7 @@ public class ReplicationSourceLogQueue {
|
|||
* @param walGroupId walGroupId
|
||||
*/
|
||||
public int getQueueSize(String walGroupId) {
|
||||
Queue queue = queues.get(walGroupId);
|
||||
Queue<Path> queue = queues.get(walGroupId);
|
||||
if (queue == null) {
|
||||
return 0;
|
||||
}
|
||||
|
@ -117,7 +117,7 @@ public class ReplicationSourceLogQueue {
|
|||
|
||||
/**
|
||||
* Return queue for the given walGroupId Please don't add or remove elements from the returned
|
||||
* queue. Use @enqueueLog and @remove methods respectively.
|
||||
* queue. Use {@link #enqueueLog(Path, String)} and {@link #remove(String)} methods respectively.
|
||||
* @param walGroupId walGroupId
|
||||
*/
|
||||
public PriorityBlockingQueue<Path> getQueue(String walGroupId) {
@ -17,17 +17,22 @@
|
|||
*/
|
||||
package org.apache.hadoop.hbase.replication.regionserver;
|
||||
|
||||
import com.google.errorprone.annotations.RestrictedApi;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.net.URLEncoder;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.Collections;
|
||||
import java.util.Comparator;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.NavigableSet;
|
||||
import java.util.OptionalLong;
|
||||
import java.util.PriorityQueue;
|
||||
import java.util.Set;
|
||||
import java.util.SortedSet;
|
||||
import java.util.TreeSet;
|
||||
|
@ -49,12 +54,13 @@ import org.apache.hadoop.hbase.ServerName;
|
|||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeer;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeer.PeerState;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerImpl;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeers;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueData;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationUtils;
|
||||
import org.apache.hadoop.hbase.replication.SyncReplicationState;
|
||||
|
@ -68,6 +74,7 @@ import org.apache.zookeeper.KeeperException;
|
|||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap;
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.Sets;
|
||||
import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;
|
||||
|
||||
|
@ -90,26 +97,25 @@ import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFacto
|
|||
* operations.</li>
|
||||
* <li>Need synchronized on {@link #walsById}. There are four methods which modify it,
|
||||
* {@link #addPeer(String)}, {@link #removePeer(String)},
|
||||
* {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} and {@link #preLogRoll(Path)}.
|
||||
* {@link #walsById} is a ConcurrentHashMap and there is a Lock for peer id in
|
||||
* {@link PeerProcedureHandlerImpl}. So there is no race between {@link #addPeer(String)} and
|
||||
* {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} and
|
||||
* {@link #postLogRoll(Path)}. {@link #walsById} is a ConcurrentHashMap and there is a Lock for peer
|
||||
* id in {@link PeerProcedureHandlerImpl}. So there is no race between {@link #addPeer(String)} and
|
||||
* {@link #removePeer(String)}. {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)}
|
||||
* is called by {@link ReplicationSourceInterface}. So no race with {@link #addPeer(String)}.
|
||||
* {@link #removePeer(String)} will terminate the {@link ReplicationSourceInterface} firstly, then
|
||||
* remove the wals from {@link #walsById}. So no race with {@link #removePeer(String)}. The only
|
||||
* case need synchronized is {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} and
|
||||
* {@link #preLogRoll(Path)}.</li>
|
||||
* {@link #postLogRoll(Path)}.</li>
|
||||
* <li>No need synchronized on {@link #walsByIdRecoveredQueues}. There are three methods which
|
||||
* modify it, {@link #removePeer(String)} ,
|
||||
* {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} and
|
||||
* {@link ReplicationSourceManager#claimQueue(ServerName, String)}.
|
||||
* {@link #claimQueue(ReplicationQueueId)}.
|
||||
* {@link #cleanOldLogs(String, boolean, ReplicationSourceInterface)} is called by
|
||||
* {@link ReplicationSourceInterface}. {@link #removePeer(String)} will terminate the
|
||||
* {@link ReplicationSourceInterface} firstly, then remove the wals from
|
||||
* {@link #walsByIdRecoveredQueues}. And
|
||||
* {@link ReplicationSourceManager#claimQueue(ServerName, String)} will add the wals to
|
||||
* {@link #walsByIdRecoveredQueues} firstly, then start up a {@link ReplicationSourceInterface}. So
|
||||
* there is no race here. For {@link ReplicationSourceManager#claimQueue(ServerName, String)} and
|
||||
* {@link #walsByIdRecoveredQueues}. And {@link #claimQueue(ReplicationQueueId)} will add the wals
|
||||
* to {@link #walsByIdRecoveredQueues} firstly, then start up a {@link ReplicationSourceInterface}.
|
||||
* So there is no race here. For {@link #claimQueue(ReplicationQueueId)} and
|
||||
* {@link #removePeer(String)}, there is already synchronized on {@link #oldsources}. So no need
|
||||
* synchronized on {@link #walsByIdRecoveredQueues}.</li>
|
||||
* <li>Need synchronized on {@link #latestPaths} to avoid the new open source miss new log.</li>
|
||||
|
@ -141,11 +147,12 @@ public class ReplicationSourceManager {
|
|||
// All logs we are currently tracking
|
||||
// Index structure of the map is: queue_id->logPrefix/logGroup->logs
|
||||
// For normal replication source, the peer id is same with the queue id
|
||||
private final ConcurrentMap<String, Map<String, NavigableSet<String>>> walsById;
|
||||
private final ConcurrentMap<ReplicationQueueId, Map<String, NavigableSet<String>>> walsById;
|
||||
// Logs for recovered sources we are currently tracking
|
||||
// the map is: queue_id->logPrefix/logGroup->logs
|
||||
// For recovered source, the queue id's format is peer_id-servername-*
|
||||
private final ConcurrentMap<String, Map<String, NavigableSet<String>>> walsByIdRecoveredQueues;
|
||||
private final ConcurrentMap<ReplicationQueueId,
|
||||
Map<String, NavigableSet<String>>> walsByIdRecoveredQueues;
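  // Illustrative sketch, not part of this patch: one entry of the maps above for a made-up peer
  // "2". A queue id maps to wal-group prefixes, each holding the sorted wal names of that group.
  private static Map<String, NavigableSet<String>> exampleGroupToWals() {
    NavigableSet<String> wals = new TreeSet<>();
    wals.add("rs1%2C16020%2C1689000000000.1689000012345");
    Map<String, NavigableSet<String>> byGroup = new HashMap<>();
    byGroup.put("rs1%2C16020%2C1689000000000", wals);
    return byGroup; // walsById.get(queueId) would have this shape
  }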
|
||||
|
||||
private final SyncReplicationPeerMappingManager syncReplicationPeerMappingManager;
|
||||
|
||||
|
@ -163,8 +170,6 @@ public class ReplicationSourceManager {
|
|||
// Homemade executer service for replication
|
||||
private final ThreadPoolExecutor executor;
|
||||
|
||||
private final boolean replicationForBulkLoadDataEnabled;
|
||||
|
||||
private AtomicLong totalBufferUsed = new AtomicLong();
|
||||
|
||||
// How long should we sleep for each retry when deleting remote wal files for sync replication
|
||||
|
@ -219,8 +224,6 @@ public class ReplicationSourceManager {
|
|||
tfb.setDaemon(true);
|
||||
this.executor.setThreadFactory(tfb.build());
|
||||
this.latestPaths = new HashMap<>();
|
||||
this.replicationForBulkLoadDataEnabled = conf.getBoolean(
|
||||
HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, HConstants.REPLICATION_BULKLOAD_ENABLE_DEFAULT);
|
||||
this.sleepForRetries = this.conf.getLong("replication.source.sync.sleepforretries", 1000);
|
||||
this.maxRetriesMultiplier =
|
||||
this.conf.getInt("replication.source.sync.maxretriesmultiplier", 60);
|
||||
|
@ -235,11 +238,6 @@ public class ReplicationSourceManager {
|
|||
void init() throws IOException {
|
||||
for (String id : this.replicationPeers.getAllPeerIds()) {
|
||||
addSource(id);
|
||||
if (replicationForBulkLoadDataEnabled) {
|
||||
// Check if peer exists in hfile-refs queue, if not add it. This can happen in the case
|
||||
// when a peer was added before replication for bulk loaded data was enabled.
|
||||
throwIOExceptionWhenFail(() -> this.queueStorage.addPeerToHFileRefs(id));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -260,9 +258,6 @@ public class ReplicationSourceManager {
|
|||
}
|
||||
if (added) {
|
||||
addSource(peerId);
|
||||
if (replicationForBulkLoadDataEnabled) {
|
||||
throwIOExceptionWhenFail(() -> this.queueStorage.addPeerToHFileRefs(peerId));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -293,26 +288,17 @@ public class ReplicationSourceManager {
|
|||
removeRecoveredSource(src);
|
||||
}
|
||||
}
|
||||
LOG
|
||||
.info("Number of deleted recovered sources for " + peerId + ": " + oldSourcesToDelete.size());
|
||||
LOG.info("Number of deleted recovered sources for {}: {}", peerId, oldSourcesToDelete.size());
|
||||
// Now close the normal source for this peer
|
||||
ReplicationSourceInterface srcToRemove = this.sources.get(peerId);
|
||||
if (srcToRemove != null) {
|
||||
srcToRemove.terminate(terminateMessage);
|
||||
removeSource(srcToRemove);
|
||||
} else {
|
||||
// This only happened in unit test TestReplicationSourceManager#testPeerRemovalCleanup
|
||||
// Delete queue from storage and memory and queue id is same with peer id for normal
|
||||
// source
|
||||
deleteQueue(peerId);
|
||||
this.walsById.remove(peerId);
|
||||
}
|
||||
ReplicationPeerConfig peerConfig = peer.getPeerConfig();
|
||||
if (peerConfig.isSyncReplication()) {
|
||||
syncReplicationPeerMappingManager.remove(peerId, peerConfig);
|
||||
}
|
||||
// Remove HFile Refs
|
||||
abortWhenFail(() -> this.queueStorage.removePeerFromHFileRefs(peerId));
|
||||
}
|
||||
|
||||
/**
|
||||
|
@ -320,17 +306,17 @@ public class ReplicationSourceManager {
|
|||
* @param queueId the id of the replication queue to associate the ReplicationSource with.
|
||||
* @see #createCatalogReplicationSource(RegionInfo) for creating a ReplicationSource for meta.
|
||||
*/
|
||||
private ReplicationSourceInterface createSource(String queueId, ReplicationPeer replicationPeer)
|
||||
throws IOException {
|
||||
ReplicationSourceInterface src = ReplicationSourceFactory.create(conf, queueId);
|
||||
private ReplicationSourceInterface createSource(ReplicationQueueData queueData,
|
||||
ReplicationPeer replicationPeer) throws IOException {
|
||||
ReplicationSourceInterface src = ReplicationSourceFactory.create(conf, queueData.getId());
|
||||
// Init the just created replication source. Pass the default walProvider's wal file length
|
||||
// provider. Presumption is we replicate user-space Tables only. For hbase:meta region replica
|
||||
// replication, see #createCatalogReplicationSource().
|
||||
WALFileLengthProvider walFileLengthProvider = this.walFactory.getWALProvider() != null
|
||||
? this.walFactory.getWALProvider().getWALFileLengthProvider()
|
||||
: p -> OptionalLong.empty();
|
||||
src.init(conf, fs, this, queueStorage, replicationPeer, server, queueId, clusterId,
|
||||
walFileLengthProvider, new MetricsSource(queueId));
|
||||
src.init(conf, fs, this, queueStorage, replicationPeer, server, queueData, clusterId,
|
||||
walFileLengthProvider, new MetricsSource(queueData.getId().toString()));
|
||||
return src;
|
||||
}
|
||||
|
||||
|
@ -351,12 +337,14 @@ public class ReplicationSourceManager {
|
|||
LOG.info("Legacy region replication peer found, skip adding: {}", peer.getPeerConfig());
|
||||
return;
|
||||
}
|
||||
ReplicationSourceInterface src = createSource(peerId, peer);
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(server.getServerName(), peerId);
|
||||
ReplicationSourceInterface src =
|
||||
createSource(new ReplicationQueueData(queueId, ImmutableMap.of()), peer);
|
||||
// synchronized on latestPaths to avoid missing the new log
|
||||
synchronized (this.latestPaths) {
|
||||
this.sources.put(peerId, src);
|
||||
Map<String, NavigableSet<String>> walsByGroup = new HashMap<>();
|
||||
this.walsById.put(peerId, walsByGroup);
|
||||
this.walsById.put(queueId, walsByGroup);
|
||||
// Add the latest wal to that source's queue
|
||||
if (!latestPaths.isEmpty()) {
|
||||
for (Map.Entry<String, Path> walPrefixAndPath : latestPaths.entrySet()) {
|
||||
|
@ -365,8 +353,10 @@ public class ReplicationSourceManager {
|
|||
wals.add(walPath.getName());
|
||||
walsByGroup.put(walPrefixAndPath.getKey(), wals);
|
||||
// Abort RS and throw exception to make add peer failed
|
||||
// TODO: can record the length of the current wal file so we could replicate less data
|
||||
abortAndThrowIOExceptionWhenFail(
|
||||
() -> this.queueStorage.addWAL(server.getServerName(), peerId, walPath.getName()));
|
||||
() -> this.queueStorage.setOffset(queueId, walPrefixAndPath.getKey(),
|
||||
new ReplicationGroupOffset(walPath.getName(), 0), Collections.emptyMap()));
|
||||
src.enqueueLog(walPath);
|
||||
LOG.trace("Enqueued {} to source {} during source creation.", walPath, src.getQueueId());
|
||||
}
|
||||
|
@ -399,7 +389,10 @@ public class ReplicationSourceManager {
|
|||
+ " is transiting to STANDBY. Will close the previous replication source and open a new one";
|
||||
ReplicationPeer peer = replicationPeers.getPeer(peerId);
|
||||
assert peer.getPeerConfig().isSyncReplication();
|
||||
ReplicationSourceInterface src = createSource(peerId, peer);
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(server.getServerName(), peerId);
|
||||
// TODO: use empty initial offsets for now, revisit when adding support for sync replication
|
||||
ReplicationSourceInterface src =
|
||||
createSource(new ReplicationQueueData(queueId, ImmutableMap.of()), peer);
|
||||
// synchronized here to avoid race with preLogRoll where we add new log to source and also
|
||||
// walsById.
|
||||
ReplicationSourceInterface toRemove;
|
||||
|
@ -416,17 +409,18 @@ public class ReplicationSourceManager {
|
|||
// map from walsById since later we may fail to delete them from the replication queue
|
||||
// storage, and when we retry next time, we can not know the wal files that need to be deleted
|
||||
// from the replication queue storage.
|
||||
walsById.get(peerId).forEach((k, v) -> wals.put(k, new TreeSet<>(v)));
|
||||
walsById.get(queueId).forEach((k, v) -> wals.put(k, new TreeSet<>(v)));
|
||||
}
|
||||
LOG.info("Startup replication source for " + src.getPeerId());
|
||||
src.startup();
|
||||
for (NavigableSet<String> walsByGroup : wals.values()) {
|
||||
for (String wal : walsByGroup) {
|
||||
queueStorage.removeWAL(server.getServerName(), peerId, wal);
|
||||
}
|
||||
// TODO: just need to reset the replication offset
|
||||
// for (String wal : walsByGroup) {
|
||||
// queueStorage.removeWAL(server.getServerName(), peerId, wal);
|
||||
// }
|
||||
}
|
||||
synchronized (walsById) {
|
||||
Map<String, NavigableSet<String>> oldWals = walsById.get(peerId);
|
||||
Map<String, NavigableSet<String>> oldWals = walsById.get(queueId);
|
||||
wals.forEach((k, v) -> {
|
||||
NavigableSet<String> walsByGroup = oldWals.get(k);
|
||||
if (walsByGroup != null) {
|
||||
|
@ -441,17 +435,28 @@ public class ReplicationSourceManager {
|
|||
for (Iterator<ReplicationSourceInterface> iter = oldsources.iterator(); iter.hasNext();) {
|
||||
ReplicationSourceInterface oldSource = iter.next();
|
||||
if (oldSource.getPeerId().equals(peerId)) {
|
||||
String queueId = oldSource.getQueueId();
|
||||
ReplicationQueueId oldSourceQueueId = oldSource.getQueueId();
|
||||
oldSource.terminate(terminateMessage);
|
||||
oldSource.getSourceMetrics().clear();
|
||||
queueStorage.removeQueue(server.getServerName(), queueId);
|
||||
walsByIdRecoveredQueues.remove(queueId);
|
||||
queueStorage.removeQueue(oldSourceQueueId);
|
||||
walsByIdRecoveredQueues.remove(oldSourceQueueId);
|
||||
iter.remove();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private ReplicationSourceInterface createRefreshedSource(ReplicationQueueId queueId,
|
||||
ReplicationPeer peer) throws IOException {
|
||||
Map<String, ReplicationGroupOffset> offsets;
|
||||
try {
|
||||
offsets = queueStorage.getOffsets(queueId);
|
||||
} catch (ReplicationException e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
return createSource(new ReplicationQueueData(queueId, ImmutableMap.copyOf(offsets)), peer);
|
||||
}
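  // Illustrative sketch, not part of this patch: the shape of the offsets map read from
  // queueStorage.getOffsets(queueId) and wrapped into ReplicationQueueData above. The names and
  // numbers are invented; each wal-group prefix maps to the last offset recorded for that group.
  private static Map<String, ReplicationGroupOffset> exampleOffsets() {
    Map<String, ReplicationGroupOffset> offsets = new HashMap<>();
    offsets.put("rs1%2C16020%2C1689000000000",
      new ReplicationGroupOffset("rs1%2C16020%2C1689000000000.1689000012345", 2048L));
    return offsets;
  }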
|
||||
|
||||
/**
|
||||
* Close the previous replication sources of this peer id and open new sources to trigger the new
|
||||
* replication state changes or new replication config changes. Here we don't need to change
|
||||
|
@ -462,6 +467,7 @@ public class ReplicationSourceManager {
|
|||
String terminateMessage = "Peer " + peerId
|
||||
+ " state or config changed. Will close the previous replication source and open a new one";
|
||||
ReplicationPeer peer = replicationPeers.getPeer(peerId);
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(server.getServerName(), peerId);
|
||||
ReplicationSourceInterface src;
|
||||
// synchronized on latestPaths to avoid missing the new log
|
||||
synchronized (this.latestPaths) {
|
||||
|
@ -471,9 +477,9 @@ public class ReplicationSourceManager {
|
|||
// Do not clear metrics
|
||||
toRemove.terminate(terminateMessage, null, false);
|
||||
}
|
||||
src = createSource(peerId, peer);
|
||||
src = createRefreshedSource(queueId, peer);
|
||||
this.sources.put(peerId, src);
|
||||
for (NavigableSet<String> walsByGroup : walsById.get(peerId).values()) {
|
||||
for (NavigableSet<String> walsByGroup : walsById.get(queueId).values()) {
|
||||
walsByGroup.forEach(wal -> src.enqueueLog(new Path(this.logDir, wal)));
|
||||
}
|
||||
}
|
||||
|
@ -483,20 +489,22 @@ public class ReplicationSourceManager {
|
|||
List<ReplicationSourceInterface> toStartup = new ArrayList<>();
|
||||
// synchronized on oldsources to avoid race with NodeFailoverWorker
|
||||
synchronized (this.oldsources) {
|
||||
List<String> previousQueueIds = new ArrayList<>();
|
||||
List<ReplicationQueueId> oldSourceQueueIds = new ArrayList<>();
|
||||
for (Iterator<ReplicationSourceInterface> iter = this.oldsources.iterator(); iter
|
||||
.hasNext();) {
|
||||
ReplicationSourceInterface oldSource = iter.next();
|
||||
if (oldSource.getPeerId().equals(peerId)) {
|
||||
previousQueueIds.add(oldSource.getQueueId());
|
||||
oldSourceQueueIds.add(oldSource.getQueueId());
|
||||
oldSource.terminate(terminateMessage);
|
||||
iter.remove();
|
||||
}
|
||||
}
|
||||
for (String queueId : previousQueueIds) {
|
||||
ReplicationSourceInterface recoveredReplicationSource = createSource(queueId, peer);
|
||||
for (ReplicationQueueId oldSourceQueueId : oldSourceQueueIds) {
|
||||
ReplicationSourceInterface recoveredReplicationSource =
|
||||
createRefreshedSource(oldSourceQueueId, peer);
|
||||
this.oldsources.add(recoveredReplicationSource);
|
||||
for (SortedSet<String> walsByGroup : walsByIdRecoveredQueues.get(queueId).values()) {
|
||||
for (SortedSet<String> walsByGroup : walsByIdRecoveredQueues.get(oldSourceQueueId)
|
||||
.values()) {
|
||||
walsByGroup.forEach(wal -> recoveredReplicationSource.enqueueLog(new Path(wal)));
|
||||
}
|
||||
toStartup.add(recoveredReplicationSource);
|
||||
|
@ -549,8 +557,8 @@ public class ReplicationSourceManager {
|
|||
* Delete a complete queue of wals associated with a replication source
|
||||
* @param queueId the id of replication queue to delete
|
||||
*/
|
||||
private void deleteQueue(String queueId) {
|
||||
abortWhenFail(() -> this.queueStorage.removeQueue(server.getServerName(), queueId));
|
||||
private void deleteQueue(ReplicationQueueId queueId) {
|
||||
abortWhenFail(() -> this.queueStorage.removeQueue(queueId));
|
||||
}
|
||||
|
||||
@FunctionalInterface
|
||||
|
@ -616,10 +624,15 @@ public class ReplicationSourceManager {
|
|||
*/
|
||||
public void logPositionAndCleanOldLogs(ReplicationSourceInterface source,
|
||||
WALEntryBatch entryBatch) {
|
||||
String fileName = entryBatch.getLastWalPath().getName();
|
||||
interruptOrAbortWhenFail(() -> this.queueStorage.setWALPosition(server.getServerName(),
|
||||
source.getQueueId(), fileName, entryBatch.getLastWalPosition(), entryBatch.getLastSeqIds()));
|
||||
cleanOldLogs(fileName, entryBatch.isEndOfFile(), source);
|
||||
String walName = entryBatch.getLastWalPath().getName();
|
||||
String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(walName);
|
||||
// if end of file, we just set the offset to -1 so we know that this file has already been fully
|
||||
// replicated, otherwise we need to compare the file length
|
||||
ReplicationGroupOffset offset = new ReplicationGroupOffset(walName,
|
||||
entryBatch.isEndOfFile() ? -1 : entryBatch.getLastWalPosition());
|
||||
interruptOrAbortWhenFail(() -> this.queueStorage.setOffset(source.getQueueId(), walPrefix,
|
||||
offset, entryBatch.getLastSeqIds()));
|
||||
cleanOldLogs(walName, entryBatch.isEndOfFile(), source);
|
||||
}
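  // Illustrative sketch, not part of this patch: the offset value recorded above, as a tiny
  // helper. An exhausted wal is stored with offset -1 so later readers can tell "fully shipped"
  // apart from "shipped up to byte N".
  private static ReplicationGroupOffset exampleRecordedOffset(String walName, boolean endOfFile,
    long lastWalPosition) {
    return new ReplicationGroupOffset(walName, endOfFile ? -1 : lastWalPosition);
  }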
|
||||
|
||||
/**
|
||||
|
@ -644,7 +657,7 @@ public class ReplicationSourceManager {
|
|||
} else {
|
||||
NavigableSet<String> wals;
|
||||
NavigableSet<String> walsToRemove;
|
||||
// synchronized on walsById to avoid race with preLogRoll
|
||||
// synchronized on walsById to avoid race with postLogRoll
|
||||
synchronized (this.walsById) {
|
||||
wals = walsById.get(source.getQueueId()).get(logPrefix);
|
||||
if (wals == null) {
|
||||
|
@ -726,33 +739,21 @@ public class ReplicationSourceManager {
|
|||
}
|
||||
}
|
||||
}
|
||||
String queueId = source.getQueueId();
|
||||
for (String wal : wals) {
|
||||
interruptOrAbortWhenFail(
|
||||
() -> this.queueStorage.removeWAL(server.getServerName(), queueId, wal));
|
||||
}
|
||||
}
|
||||
|
||||
// public because of we call it in TestReplicationEmptyWALRecovery
|
||||
public void preLogRoll(Path newLog) throws IOException {
|
||||
public void postLogRoll(Path newLog) throws IOException {
|
||||
String logName = newLog.getName();
|
||||
String logPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(logName);
|
||||
// synchronized on latestPaths to avoid the new open source miss the new log
|
||||
synchronized (this.latestPaths) {
|
||||
// Add log to queue storage
|
||||
for (ReplicationSourceInterface source : this.sources.values()) {
|
||||
// If record log to queue storage failed, abort RS and throw exception to make log roll
|
||||
// failed
|
||||
abortAndThrowIOExceptionWhenFail(
|
||||
() -> this.queueStorage.addWAL(server.getServerName(), source.getQueueId(), logName));
|
||||
}
|
||||
|
||||
// synchronized on walsById to avoid race with cleanOldLogs
|
||||
synchronized (this.walsById) {
|
||||
// Update walsById map
|
||||
for (Map.Entry<String, Map<String, NavigableSet<String>>> entry : this.walsById
|
||||
for (Map.Entry<ReplicationQueueId, Map<String, NavigableSet<String>>> entry : this.walsById
|
||||
.entrySet()) {
|
||||
String peerId = entry.getKey();
|
||||
ReplicationQueueId queueId = entry.getKey();
|
||||
String peerId = queueId.getPeerId();
|
||||
Map<String, NavigableSet<String>> walsByPrefix = entry.getValue();
|
||||
boolean existingPrefix = false;
|
||||
for (Map.Entry<String, NavigableSet<String>> walsEntry : walsByPrefix.entrySet()) {
|
||||
|
@ -780,10 +781,6 @@ public class ReplicationSourceManager {
|
|||
// Add to latestPaths
|
||||
latestPaths.put(logPrefix, newLog);
|
||||
}
|
||||
}
|
||||
|
||||
// public because of we call it in TestReplicationEmptyWALRecovery
|
||||
public void postLogRoll(Path newLog) throws IOException {
|
||||
// This only updates the sources we own, not the recovered ones
|
||||
for (ReplicationSourceInterface source : this.sources.values()) {
|
||||
source.enqueueLog(newLog);
|
||||
|
@ -792,7 +789,29 @@ public class ReplicationSourceManager {
|
|||
}
|
||||
}
|
||||
|
||||
void claimQueue(ServerName deadRS, String queue) {
|
||||
/**
|
||||
* Check whether we should replicate the given {@code wal}.
|
||||
* @param wal the file name of the wal
|
||||
* @return {@code true} means we should replicate the given {@code wal}, otherwise {@code false}.
|
||||
*/
|
||||
private boolean shouldReplicate(ReplicationGroupOffset offset, String wal) {
|
||||
if (offset == null || offset == ReplicationGroupOffset.BEGIN) {
|
||||
return false;
|
||||
}
|
||||
long walTs = AbstractFSWALProvider.getTimestamp(wal);
|
||||
long startWalTs = AbstractFSWALProvider.getTimestamp(offset.getWal());
|
||||
if (walTs < startWalTs) {
|
||||
return false;
|
||||
} else if (walTs > startWalTs) {
|
||||
return true;
|
||||
}
|
||||
// if the timestamp equals, usually it means we should include this wal but there is a special
|
||||
// case, a negative offset means the wal has already been fully replicated, so here we should
|
||||
// check the offset.
|
||||
return offset.getOffset() >= 0;
|
||||
}
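  // Illustrative check, not part of this patch: with a recorded offset on an invented wal
  // "rs1%2C16020%2C1689000000000.1689000012345", an older wal (smaller timestamp suffix) is
  // skipped, a newer one is replicated, and the same wal is replicated only if its recorded
  // offset is not negative.
  private boolean exampleShouldReplicate() {
    ReplicationGroupOffset recorded =
      new ReplicationGroupOffset("rs1%2C16020%2C1689000000000.1689000012345", -1);
    // newer timestamp suffix than the recorded wal, so this returns true
    return shouldReplicate(recorded, "rs1%2C16020%2C1689000000000.1689000099999");
  }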
|
||||
|
||||
void claimQueue(ReplicationQueueId queueId) {
|
||||
// Wait a bit before transferring the queues, we may be shutting down.
|
||||
// This sleep may not be enough in some cases.
|
||||
try {
|
||||
|
@ -807,66 +826,83 @@ public class ReplicationSourceManager {
|
|||
LOG.info("Not transferring queue since we are shutting down");
|
||||
return;
|
||||
}
|
||||
// After claim the queues from dead region server, wewill skip to start the
|
||||
// After claim the queues from dead region server, we will skip to start the
|
||||
// RecoveredReplicationSource if the peer has been removed. but there's possible that remove a
|
||||
// peer with peerId = 2 and add a peer with peerId = 2 again during failover. So we need to get
|
||||
// a copy of the replication peer first to decide whether we should start the
|
||||
// RecoveredReplicationSource. If the latest peer is not the old peer, we should also skip to
|
||||
// start the RecoveredReplicationSource, Otherwise the rs will abort (See HBASE-20475).
|
||||
String peerId = new ReplicationQueueInfo(queue).getPeerId();
|
||||
String peerId = queueId.getPeerId();
|
||||
ReplicationPeerImpl oldPeer = replicationPeers.getPeer(peerId);
|
||||
if (oldPeer == null) {
|
||||
LOG.info("Not transferring queue since the replication peer {} for queue {} does not exist",
|
||||
peerId, queue);
|
||||
peerId, queueId);
|
||||
return;
|
||||
}
|
||||
Pair<String, SortedSet<String>> claimedQueue;
|
||||
Map<String, ReplicationGroupOffset> offsets;
|
||||
try {
|
||||
claimedQueue = queueStorage.claimQueue(deadRS, queue, server.getServerName());
|
||||
offsets = queueStorage.claimQueue(queueId, server.getServerName());
|
||||
} catch (ReplicationException e) {
|
||||
LOG.error(
|
||||
"ReplicationException: cannot claim dead region ({})'s " + "replication queue. Znode : ({})"
|
||||
+ " Possible solution: check if znode size exceeds jute.maxBuffer value. "
|
||||
+ " If so, increase it for both client and server side.",
|
||||
deadRS, queueStorage.getRsNode(deadRS), e);
|
||||
LOG.error("ReplicationException: cannot claim dead region ({})'s replication queue",
|
||||
queueId.getServerName(), e);
|
||||
server.abort("Failed to claim queue from dead regionserver.", e);
|
||||
return;
|
||||
}
|
||||
if (claimedQueue.getSecond().isEmpty()) {
|
||||
if (offsets.isEmpty()) {
|
||||
// someone else claimed the queue
|
||||
return;
|
||||
}
|
||||
String queueId = claimedQueue.getFirst();
|
||||
Set<String> walsSet = claimedQueue.getSecond();
|
||||
ServerName sourceRS = queueId.getServerWALsBelongTo();
|
||||
ReplicationQueueId claimedQueueId = queueId.claim(server.getServerName());
|
||||
ReplicationPeerImpl peer = replicationPeers.getPeer(peerId);
|
||||
if (peer == null || peer != oldPeer) {
|
||||
LOG.warn("Skipping failover for peer {} of node {}, peer is null", peerId, deadRS);
|
||||
abortWhenFail(() -> queueStorage.removeQueue(server.getServerName(), queueId));
|
||||
LOG.warn("Skipping failover for peer {} of node {}, peer is null", peerId, sourceRS);
|
||||
deleteQueue(claimedQueueId);
|
||||
return;
|
||||
}
|
||||
if (
|
||||
server instanceof ReplicationSyncUp.DummyServer
|
||||
&& peer.getPeerState().equals(PeerState.DISABLED)
|
||||
) {
|
||||
LOG.warn(
|
||||
"Peer {} is disabled. ReplicationSyncUp tool will skip " + "replicating data to this peer.",
|
||||
peerId);
|
||||
return;
|
||||
}
|
||||
|
||||
ReplicationSourceInterface src;
|
||||
try {
|
||||
src = createSource(queueId, peer);
|
||||
src =
|
||||
createSource(new ReplicationQueueData(claimedQueueId, ImmutableMap.copyOf(offsets)), peer);
|
||||
} catch (IOException e) {
|
||||
LOG.error("Can not create replication source for peer {} and queue {}", peerId, queueId, e);
|
||||
LOG.error("Can not create replication source for peer {} and queue {}", peerId,
|
||||
claimedQueueId, e);
|
||||
server.abort("Failed to create replication source after claiming queue.", e);
|
||||
return;
|
||||
}
|
||||
List<Path> walFiles;
|
||||
try {
|
||||
walFiles = AbstractFSWALProvider.getArchivedWALFiles(conf, sourceRS,
|
||||
URLEncoder.encode(sourceRS.toString(), StandardCharsets.UTF_8.name()));
|
||||
} catch (IOException e) {
|
||||
LOG.error("Can not list all wal files for peer {} and queue {}", peerId, queueId, e);
|
||||
server.abort("Can not list all wal files after claiming queue.", e);
|
||||
return;
|
||||
}
|
||||
PriorityQueue<Path> walFilesPQ = new PriorityQueue<>(
|
||||
Comparator.<Path, Long> comparing(p -> AbstractFSWALProvider.getTimestamp(p.getName()))
|
||||
.thenComparing(Path::getName));
|
||||
// sort the wal files and also filter out replicated files
|
||||
for (Path file : walFiles) {
|
||||
String walGroupId = AbstractFSWALProvider.getWALPrefixFromWALName(file.getName());
|
||||
ReplicationGroupOffset groupOffset = offsets.get(walGroupId);
|
||||
if (shouldReplicate(groupOffset, file.getName())) {
|
||||
walFilesPQ.add(file);
|
||||
} else {
|
||||
LOG.debug("Skip enqueuing log {} because it is before the start offset {}", file.getName(),
|
||||
groupOffset);
|
||||
}
|
||||
}
|
||||
// the method is a bit long, so assign it to null here to avoid later we reuse it again by
|
||||
// mistake, we should use the sorted walFilesPQ instead
|
||||
walFiles = null;
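      // Illustrative note, not part of this patch: the comparator above ranks the claimed wal
      // files by their numeric timestamp suffix and then by name, so for invented names
      // "....1689000011111" sorts ahead of "....1689000012345", i.e. the oldest wal of a group
      // comes out of the priority queue first.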
|
||||
// synchronized on oldsources to avoid adding recovered source for the to-be-removed peer
|
||||
synchronized (oldsources) {
|
||||
peer = replicationPeers.getPeer(src.getPeerId());
|
||||
if (peer == null || peer != oldPeer) {
|
||||
src.terminate("Recovered queue doesn't belong to any current peer");
|
||||
deleteQueue(queueId);
|
||||
deleteQueue(claimedQueueId);
|
||||
return;
|
||||
}
|
||||
// Do not setup recovered queue if a sync replication peer is in STANDBY state, or is
|
||||
|
@ -882,26 +918,26 @@ public class ReplicationSourceManager {
|
|||
|| stateAndNewState.getSecond().equals(SyncReplicationState.STANDBY)
|
||||
) {
|
||||
src.terminate("Sync replication peer is in STANDBY state");
|
||||
deleteQueue(queueId);
|
||||
deleteQueue(claimedQueueId);
|
||||
return;
|
||||
}
|
||||
}
|
||||
// track sources in walsByIdRecoveredQueues
|
||||
Map<String, NavigableSet<String>> walsByGroup = new HashMap<>();
|
||||
walsByIdRecoveredQueues.put(queueId, walsByGroup);
|
||||
for (String wal : walsSet) {
|
||||
String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal);
|
||||
walsByIdRecoveredQueues.put(claimedQueueId, walsByGroup);
|
||||
for (Path wal : walFilesPQ) {
|
||||
String walPrefix = AbstractFSWALProvider.getWALPrefixFromWALName(wal.getName());
|
||||
NavigableSet<String> wals = walsByGroup.get(walPrefix);
|
||||
if (wals == null) {
|
||||
wals = new TreeSet<>();
|
||||
walsByGroup.put(walPrefix, wals);
|
||||
}
|
||||
wals.add(wal);
|
||||
wals.add(wal.getName());
|
||||
}
|
||||
oldsources.add(src);
|
||||
LOG.info("Added source for recovered queue {}", src.getQueueId());
|
||||
for (String wal : walsSet) {
|
||||
LOG.trace("Enqueueing log from recovered queue for source: " + src.getQueueId());
|
||||
LOG.info("Added source for recovered queue {}", claimedQueueId);
|
||||
for (Path wal : walFilesPQ) {
|
||||
LOG.debug("Enqueueing log {} from recovered queue for source: {}", wal, claimedQueueId);
|
||||
src.enqueueLog(new Path(oldLogDir, wal));
|
||||
}
|
||||
src.startup();
|
||||
|
@ -927,7 +963,9 @@ public class ReplicationSourceManager {
|
|||
* Get a copy of the wals of the normal sources on this rs
|
||||
* @return a sorted set of wal names
|
||||
*/
|
||||
public Map<String, Map<String, NavigableSet<String>>> getWALs() {
|
||||
@RestrictedApi(explanation = "Should only be called in tests", link = "",
|
||||
allowedOnPath = ".*/src/test/.*")
|
||||
public Map<ReplicationQueueId, Map<String, NavigableSet<String>>> getWALs() {
|
||||
return Collections.unmodifiableMap(walsById);
|
||||
}
|
||||
|
||||
|
@ -935,7 +973,9 @@ public class ReplicationSourceManager {
|
|||
* Get a copy of the wals of the recovered sources on this rs
|
||||
* @return a sorted set of wal names
|
||||
*/
|
||||
Map<String, Map<String, NavigableSet<String>>> getWalsByIdRecoveredQueues() {
|
||||
@RestrictedApi(explanation = "Should only be called in tests", link = "",
|
||||
allowedOnPath = ".*/src/test/.*")
|
||||
Map<ReplicationQueueId, Map<String, NavigableSet<String>>> getWalsByIdRecoveredQueues() {
|
||||
return Collections.unmodifiableMap(walsByIdRecoveredQueues);
|
||||
}
|
||||
|
||||
|
@ -963,16 +1003,6 @@ public class ReplicationSourceManager {
|
|||
return this.sources.get(peerId);
|
||||
}
|
||||
|
||||
List<String> getAllQueues() throws IOException {
|
||||
List<String> allQueues = Collections.emptyList();
|
||||
try {
|
||||
allQueues = queueStorage.getAllQueues(server.getServerName());
|
||||
} catch (ReplicationException e) {
|
||||
throw new IOException(e);
|
||||
}
|
||||
return allQueues;
|
||||
}
|
||||
|
||||
int getSizeOfLatestPath() {
|
||||
synchronized (latestPaths) {
|
||||
return latestPaths.size();
|
||||
|
@ -1068,6 +1098,8 @@ public class ReplicationSourceManager {
|
|||
return this.globalMetrics;
|
||||
}
|
||||
|
||||
@RestrictedApi(explanation = "Should only be called in tests", link = "",
|
||||
allowedOnPath = ".*/src/test/.*")
|
||||
ReplicationQueueStorage getQueueStorage() {
|
||||
return queueStorage;
|
||||
}
@ -57,7 +57,7 @@ public class ReplicationSourceShipper extends Thread {
|
|||
private final Configuration conf;
|
||||
protected final String walGroupId;
|
||||
protected final ReplicationSourceLogQueue logQueue;
|
||||
private final ReplicationSource source;
|
||||
protected final ReplicationSource source;
|
||||
|
||||
// Last position in the log that we sent to ZooKeeper
|
||||
// It will be accessed by the stats thread so make it volatile
|
||||
|
@ -299,10 +299,6 @@ public class ReplicationSourceShipper extends Thread {
|
|||
this.entryReader = entryReader;
|
||||
}
|
||||
|
||||
long getStartPosition() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
protected boolean isActive() {
|
||||
return source.isSourceActive() && state == WorkerState.RUNNING && !isInterrupted();
|
||||
}
@ -43,11 +43,6 @@ class ReplicationSourceWALActionListener implements WALActionsListener {
|
|||
this.manager = manager;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void preLogRoll(Path oldPath, Path newPath) throws IOException {
|
||||
manager.preLogRoll(newPath);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void postLogRoll(Path oldPath, Path newPath) throws IOException {
|
||||
manager.postLogRoll(newPath);
@ -80,17 +80,18 @@ public class ReplicationSyncUp extends Configured implements Tool {
|
|||
// replication queues for the dead region servers first and then replicate the data out.
|
||||
private void claimReplicationQueues(ZKWatcher zkw, ReplicationSourceManager mgr)
|
||||
throws ReplicationException, KeeperException {
|
||||
List<ServerName> replicators = mgr.getQueueStorage().getListOfReplicators();
|
||||
Set<ServerName> liveRegionServers = getLiveRegionServers(zkw);
|
||||
for (ServerName sn : replicators) {
|
||||
if (!liveRegionServers.contains(sn)) {
|
||||
List<String> replicationQueues = mgr.getQueueStorage().getAllQueues(sn);
|
||||
System.out.println(sn + " is dead, claim its replication queues: " + replicationQueues);
|
||||
for (String queue : replicationQueues) {
|
||||
mgr.claimQueue(sn, queue);
|
||||
}
|
||||
}
|
||||
}
|
||||
// TODO: reimplement this tool
|
||||
// List<ServerName> replicators = mgr.getQueueStorage().getListOfReplicators();
|
||||
// Set<ServerName> liveRegionServers = getLiveRegionServers(zkw);
|
||||
// for (ServerName sn : replicators) {
|
||||
// if (!liveRegionServers.contains(sn)) {
|
||||
// List<String> replicationQueues = mgr.getQueueStorage().getAllQueues(sn);
|
||||
// System.out.println(sn + " is dead, claim its replication queues: " + replicationQueues);
|
||||
// for (String queue : replicationQueues) {
|
||||
// mgr.claimQueue(sn, queue);
|
||||
// }
|
||||
// }
|
||||
// }
|
||||
}
|
||||
|
||||
@Override
@ -2558,7 +2558,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
}
|
||||
|
||||
private void checkAndFixReplication() throws ReplicationException, IOException {
|
||||
ReplicationChecker checker = new ReplicationChecker(getConf(), zkw, errors);
|
||||
ReplicationChecker checker = new ReplicationChecker(getConf(), zkw, connection, errors);
|
||||
checker.checkUnDeletedQueues();
|
||||
|
||||
if (checker.hasUnDeletedQueues() && this.fixReplication) {
|
||||
|
@ -3831,7 +3831,7 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
return;
|
||||
}
|
||||
ReplicationQueueStorage queueStorage =
|
||||
ReplicationStorageFactory.getReplicationQueueStorage(zkw, getConf());
|
||||
ReplicationStorageFactory.getReplicationQueueStorage(connection, getConf());
|
||||
List<ReplicationPeerDescription> peerDescriptions = admin.listReplicationPeers();
|
||||
if (peerDescriptions != null && peerDescriptions.size() > 0) {
|
||||
List<String> peers = peerDescriptions.stream()
@ -27,9 +27,11 @@ import java.util.Set;
|
|||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.client.Connection;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueData;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
|
||||
import org.apache.hadoop.hbase.util.HbckErrorReporter;
|
||||
|
@ -48,18 +50,18 @@ public class ReplicationChecker {
|
|||
|
||||
private final HbckErrorReporter errorReporter;
|
||||
// replicator with its queueIds for removed peers
|
||||
private Map<ServerName, List<String>> undeletedQueueIds = new HashMap<>();
|
||||
private Map<ServerName, List<ReplicationQueueId>> undeletedQueueIds = new HashMap<>();
|
||||
// replicator with its undeleted queueIds for removed peers in hfile-refs queue
|
||||
private Set<String> undeletedHFileRefsPeerIds = new HashSet<>();
|
||||
|
||||
private final ReplicationPeerStorage peerStorage;
|
||||
private final ReplicationQueueStorage queueStorage;
|
||||
|
||||
public ReplicationChecker(Configuration conf, ZKWatcher zkw, HbckErrorReporter errorReporter)
|
||||
throws IOException {
|
||||
public ReplicationChecker(Configuration conf, ZKWatcher zkw, Connection conn,
|
||||
HbckErrorReporter errorReporter) throws IOException {
|
||||
this.peerStorage =
|
||||
ReplicationStorageFactory.getReplicationPeerStorage(FileSystem.get(conf), zkw, conf);
|
||||
this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(zkw, conf);
|
||||
this.queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(conn, conf);
|
||||
this.errorReporter = errorReporter;
|
||||
}
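  // Illustrative usage, not part of this patch: hbck-style callers now hand in a Connection as
  // well, since the queue data lives in an hbase table instead of ZooKeeper, e.g.
  //   ReplicationChecker checker = new ReplicationChecker(getConf(), zkw, connection, errors);
  //   checker.checkUnDeletedQueues();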
|
||||
|
||||
|
@ -68,19 +70,19 @@ public class ReplicationChecker {
|
|||
.contains(HbckErrorReporter.ERROR_CODE.UNDELETED_REPLICATION_QUEUE);
|
||||
}
|
||||
|
||||
private Map<ServerName, List<String>> getUnDeletedQueues() throws ReplicationException {
|
||||
Map<ServerName, List<String>> undeletedQueues = new HashMap<>();
|
||||
private Map<ServerName, List<ReplicationQueueId>> getUnDeletedQueues()
|
||||
throws ReplicationException {
|
||||
Map<ServerName, List<ReplicationQueueId>> undeletedQueues = new HashMap<>();
|
||||
Set<String> peerIds = new HashSet<>(peerStorage.listPeerIds());
|
||||
for (ServerName replicator : queueStorage.getListOfReplicators()) {
|
||||
for (String queueId : queueStorage.getAllQueues(replicator)) {
|
||||
ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
|
||||
if (!peerIds.contains(queueInfo.getPeerId())) {
|
||||
undeletedQueues.computeIfAbsent(replicator, key -> new ArrayList<>()).add(queueId);
|
||||
LOG.debug(
|
||||
"Undeleted replication queue for removed peer found: "
|
||||
+ "[removedPeerId={}, replicator={}, queueId={}]",
|
||||
queueInfo.getPeerId(), replicator, queueId);
|
||||
}
|
||||
for (ReplicationQueueData queueData : queueStorage.listAllQueues()) {
|
||||
ReplicationQueueId queueId = queueData.getId();
|
||||
if (!peerIds.contains(queueId.getPeerId())) {
|
||||
undeletedQueues.computeIfAbsent(queueId.getServerName(), key -> new ArrayList<>())
|
||||
.add(queueId);
|
||||
LOG.debug(
|
||||
"Undeleted replication queue for removed peer found: "
|
||||
+ "[removedPeerId={}, replicator={}, queueId={}]",
|
||||
queueId.getPeerId(), queueId.getServerName(), queueId);
|
||||
}
|
||||
}
|
||||
return undeletedQueues;
|
||||
|
@ -103,9 +105,8 @@ public class ReplicationChecker {
|
|||
undeletedQueueIds = getUnDeletedQueues();
|
||||
undeletedQueueIds.forEach((replicator, queueIds) -> {
|
||||
queueIds.forEach(queueId -> {
|
||||
ReplicationQueueInfo queueInfo = new ReplicationQueueInfo(queueId);
|
||||
String msg = "Undeleted replication queue for removed peer found: "
|
||||
+ String.format("[removedPeerId=%s, replicator=%s, queueId=%s]", queueInfo.getPeerId(),
|
||||
+ String.format("[removedPeerId=%s, replicator=%s, queueId=%s]", queueId.getPeerId(),
|
||||
replicator, queueId);
|
||||
errorReporter.reportError(HbckErrorReporter.ERROR_CODE.UNDELETED_REPLICATION_QUEUE, msg);
|
||||
});
|
||||
|
@ -118,12 +119,12 @@ public class ReplicationChecker {
|
|||
}
|
||||
|
||||
public void fixUnDeletedQueues() throws ReplicationException {
|
||||
for (Map.Entry<ServerName, List<String>> replicatorAndQueueIds : undeletedQueueIds.entrySet()) {
|
||||
for (Map.Entry<ServerName, List<ReplicationQueueId>> replicatorAndQueueIds : undeletedQueueIds
|
||||
.entrySet()) {
|
||||
ServerName replicator = replicatorAndQueueIds.getKey();
|
||||
for (String queueId : replicatorAndQueueIds.getValue()) {
|
||||
queueStorage.removeQueue(replicator, queueId);
|
||||
for (ReplicationQueueId queueId : replicatorAndQueueIds.getValue()) {
|
||||
queueStorage.removeQueue(queueId);
|
||||
}
|
||||
queueStorage.removeReplicatorIfQueueIsEmpty(replicator);
|
||||
}
|
||||
for (String peerId : undeletedHFileRefsPeerIds) {
|
||||
queueStorage.removePeerFromHFileRefs(peerId);
|
||||
|
|
|
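Side note on the hbck change above: with the table based queue storage, dangling queues are found by listing every queue and checking its peer id, and removal only needs the queue id itself. The following is a minimal sketch of that pattern, assuming only the interfaces shown in this patch; the helper class name is made up for illustration.

// Illustrative sketch only (not part of this patch): find and drop queues whose
// peer has been removed, using the new table-based queue storage API.
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationPeerStorage;
import org.apache.hadoop.hbase.replication.ReplicationQueueData;
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;

public final class OrphanQueueSweeper { // hypothetical helper name
  private OrphanQueueSweeper() {
  }

  public static int sweep(ReplicationPeerStorage peerStorage, ReplicationQueueStorage queueStorage)
    throws ReplicationException {
    Set<String> livePeers = new HashSet<>(peerStorage.listPeerIds());
    int removed = 0;
    // listAllQueues returns one ReplicationQueueData per (server, peer) queue
    for (ReplicationQueueData data : queueStorage.listAllQueues()) {
      ReplicationQueueId queueId = data.getId();
      if (!livePeers.contains(queueId.getPeerId())) {
        // the queue id alone is enough now; no separate replicator argument any more
        queueStorage.removeQueue(queueId);
        removed++;
      }
    }
    return removed;
  }
}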
@@ -17,6 +17,7 @@
 */
package org.apache.hadoop.hbase.wal;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;

@@ -28,6 +29,7 @@ import java.util.concurrent.locks.ReentrantReadWriteLock;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Abortable;

@@ -335,6 +337,44 @@ public abstract class AbstractFSWALProvider<T extends AbstractFSWAL<?>> implemen
    return dirName.toString();
  }

  /**
   * List all the old wal files for a dead region server.
   * <p/>
   * Initially added for supporting replication, where we need to get the wal files to replicate for
   * a dead region server.
   */
  public static List<Path> getArchivedWALFiles(Configuration conf, ServerName serverName,
    String logPrefix) throws IOException {
    Path walRootDir = CommonFSUtils.getWALRootDir(conf);
    FileSystem fs = walRootDir.getFileSystem(conf);
    List<Path> archivedWalFiles = new ArrayList<>();
    // list both the root old wal dir and the separate old wal dir, so we will not miss any files if
    // the SEPARATE_OLDLOGDIR config is changed
    Path oldWalDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
    try {
      for (FileStatus status : fs.listStatus(oldWalDir, p -> p.getName().startsWith(logPrefix))) {
        if (status.isFile()) {
          archivedWalFiles.add(status.getPath());
        }
      }
    } catch (FileNotFoundException e) {
      LOG.info("Old WAL dir {} does not exist", oldWalDir);
      return Collections.emptyList();
    }
    Path separatedOldWalDir = new Path(oldWalDir, serverName.toString());
    try {
      for (FileStatus status : fs.listStatus(separatedOldWalDir,
        p -> p.getName().startsWith(logPrefix))) {
        if (status.isFile()) {
          archivedWalFiles.add(status.getPath());
        }
      }
    } catch (FileNotFoundException e) {
      LOG.info("Separated old WAL dir {} does not exist", separatedOldWalDir);
    }
    return archivedWalFiles;
  }

  /**
   * Pulls a ServerName out of a Path generated according to our layout rules. In the below layouts,
   * this method ignores the format of the logfile component. Current format: [base directory for

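The new getArchivedWALFiles helper above is what replication-side code can use to locate the already archived WALs of a crashed server. Below is a small usage sketch; the helper class is hypothetical, and the assumption that a server's WAL file names start with its ServerName string is called out in a comment.

// Illustrative usage sketch: gather the archived WALs of a crashed region server
// before resuming replication for it.
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;

public final class DeadServerWalLister { // hypothetical helper name
  private DeadServerWalLister() {
  }

  public static List<Path> listArchivedWals(Configuration conf, ServerName crashedServer)
    throws IOException {
    // assumption: WAL file names of a server start with its ServerName string,
    // so the server name itself is used as the prefix filter
    List<Path> wals = new ArrayList<>(
      AbstractFSWALProvider.getArchivedWALFiles(conf, crashedServer, crashedServer.toString()));
    // a real caller would also scan the not-yet-archived WAL directory of the server
    return wals;
  }
}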
@ -43,12 +43,12 @@ import org.apache.hadoop.hbase.DoNotRetryIOException;
|
|||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.ReplicationPeerNotFoundException;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerConfigBuilder;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerDescription;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueData;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
|
||||
import org.apache.hadoop.hbase.replication.VerifyWALEntriesReplicationEndpoint;
|
||||
|
@ -102,11 +102,9 @@ public class TestAsyncReplicationAdminApi extends TestAsyncAdminBase {
|
|||
} catch (Exception e) {
|
||||
}
|
||||
ReplicationQueueStorage queueStorage = ReplicationStorageFactory
|
||||
.getReplicationQueueStorage(TEST_UTIL.getZooKeeperWatcher(), TEST_UTIL.getConfiguration());
|
||||
for (ServerName serverName : queueStorage.getListOfReplicators()) {
|
||||
for (String queue : queueStorage.getAllQueues(serverName)) {
|
||||
queueStorage.removeQueue(serverName, queue);
|
||||
}
|
||||
.getReplicationQueueStorage(TEST_UTIL.getConnection(), TEST_UTIL.getConfiguration());
|
||||
for (ReplicationQueueData queueData : queueStorage.listAllQueues()) {
|
||||
queueStorage.removeQueue(queueData.getId());
|
||||
}
|
||||
admin.replicationPeerModificationSwitch(true).join();
|
||||
}
|
||||
|
|
|
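Across these test changes the queue storage is now obtained from a Connection instead of a ZKWatcher. A minimal sketch of that wiring, with an illustrative class and method name:

// Sketch (illustrative): create the table-based queue storage from a Connection and
// list the queues still registered for one region server.
import java.io.IOException;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;

public final class QueueStorageExample { // hypothetical class, for illustration only
  private QueueStorageExample() {
  }

  public static List<ReplicationQueueId> remainingQueues(Configuration conf, ServerName serverName)
    throws IOException, ReplicationException {
    try (Connection conn = ConnectionFactory.createConnection(conf)) {
      ReplicationQueueStorage queueStorage =
        ReplicationStorageFactory.getReplicationQueueStorage(conn, conf);
      // queues are now keyed by ReplicationQueueId (server + peer [+ source server])
      return queueStorage.listAllQueueIds(serverName);
    }
  }
}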
@@ -151,7 +151,7 @@ public class MockMasterServices extends MockNoopMasterServices {
    CommonFSUtils.setRootDir(getConfiguration(), rootdir);
    this.rpm = mock(ReplicationPeerManager.class);
    ReplicationQueueStorage rqs = mock(ReplicationQueueStorage.class);
    when(rqs.getAllQueues(any())).thenReturn(Collections.emptyList());
    when(rqs.listAllQueueIds(any(ServerName.class))).thenReturn(Collections.emptyList());
    when(rpm.getQueueStorage()).thenReturn(rqs);
  }

@ -20,7 +20,6 @@ package org.apache.hadoop.hbase.master.cleaner;
|
|||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.mockito.Mockito.doAnswer;
|
||||
import static org.mockito.Mockito.doThrow;
|
||||
import static org.mockito.Mockito.spy;
|
||||
|
||||
|
@ -45,8 +44,8 @@ import org.apache.hadoop.hbase.HConstants;
|
|||
import org.apache.hadoop.hbase.Server;
|
||||
import org.apache.hadoop.hbase.Waiter;
|
||||
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
|
||||
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
|
||||
import org.apache.hadoop.hbase.replication.master.ReplicationLogCleaner;
|
||||
|
@ -62,13 +61,14 @@ import org.junit.AfterClass;
|
|||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.mockito.invocation.InvocationOnMock;
|
||||
import org.mockito.stubbing.Answer;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
// revisit later after we implement new replication log cleaner
|
||||
@Ignore
|
||||
@Category({ MasterTests.class, MediumTests.class })
|
||||
public class TestLogsCleaner {
|
||||
|
||||
|
@ -131,8 +131,8 @@ public class TestLogsCleaner {
|
|||
|
||||
HMaster.decorateMasterConfiguration(conf);
|
||||
Server server = new DummyServer();
|
||||
ReplicationQueueStorage queueStorage =
|
||||
ReplicationStorageFactory.getReplicationQueueStorage(server.getZooKeeper(), conf);
|
||||
ReplicationQueueStorage queueStorage = ReplicationStorageFactory
|
||||
.getReplicationQueueStorage(ConnectionFactory.createConnection(conf), conf);
|
||||
|
||||
String fakeMachineName =
|
||||
URLEncoder.encode(server.getServerName().toString(), StandardCharsets.UTF_8.name());
|
||||
|
@ -162,7 +162,7 @@ public class TestLogsCleaner {
|
|||
// Case 4: put 3 WALs in ZK indicating that they are scheduled for replication so these
|
||||
// files would pass TimeToLiveLogCleaner but would be rejected by ReplicationLogCleaner
|
||||
if (i % (30 / 3) == 0) {
|
||||
queueStorage.addWAL(server.getServerName(), fakeMachineName, fileName.getName());
|
||||
// queueStorage.addWAL(server.getServerName(), fakeMachineName, fileName.getName());
|
||||
LOG.info("Replication log file: " + fileName);
|
||||
}
|
||||
}
|
||||
|
@ -222,20 +222,20 @@ public class TestLogsCleaner {
|
|||
|
||||
try {
|
||||
faultyZK.init(false);
|
||||
ReplicationQueueStorage queueStorage =
|
||||
spy(ReplicationStorageFactory.getReplicationQueueStorage(faultyZK, conf));
|
||||
doAnswer(new Answer<Object>() {
|
||||
@Override
|
||||
public Object answer(InvocationOnMock invocation) throws Throwable {
|
||||
try {
|
||||
return invocation.callRealMethod();
|
||||
} catch (ReplicationException e) {
|
||||
LOG.debug("Caught Exception", e);
|
||||
getListOfReplicatorsFailed.set(true);
|
||||
throw e;
|
||||
}
|
||||
}
|
||||
}).when(queueStorage).getAllWALs();
|
||||
ReplicationQueueStorage queueStorage = spy(ReplicationStorageFactory
|
||||
.getReplicationQueueStorage(ConnectionFactory.createConnection(conf), conf));
|
||||
// doAnswer(new Answer<Object>() {
|
||||
// @Override
|
||||
// public Object answer(InvocationOnMock invocation) throws Throwable {
|
||||
// try {
|
||||
// return invocation.callRealMethod();
|
||||
// } catch (ReplicationException e) {
|
||||
// LOG.debug("Caught Exception", e);
|
||||
// getListOfReplicatorsFailed.set(true);
|
||||
// throw e;
|
||||
// }
|
||||
// }
|
||||
// }).when(queueStorage).getAllWALs();
|
||||
|
||||
cleaner.setConf(conf, faultyZK, queueStorage);
|
||||
// should keep all files due to a ConnectionLossException getting the queues znodes
|
||||
|
|
|
@ -17,14 +17,12 @@
|
|||
*/
|
||||
package org.apache.hadoop.hbase.master.cleaner;
|
||||
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
import static org.junit.Assert.fail;
|
||||
import static org.mockito.Mockito.doThrow;
|
||||
import static org.mockito.Mockito.spy;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.UncheckedIOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Iterator;
|
||||
import java.util.List;
|
||||
|
@ -32,12 +30,11 @@ import org.apache.hadoop.conf.Configuration;
|
|||
import org.apache.hadoop.fs.FileStatus;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.Abortable;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtil;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.Server;
|
||||
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
|
||||
import org.apache.hadoop.hbase.client.Connection;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationException;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationFactory;
|
||||
|
@ -49,25 +46,21 @@ import org.apache.hadoop.hbase.replication.SyncReplicationState;
|
|||
import org.apache.hadoop.hbase.replication.master.ReplicationHFileCleaner;
|
||||
import org.apache.hadoop.hbase.testclassification.MasterTests;
|
||||
import org.apache.hadoop.hbase.testclassification.SmallTests;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.hadoop.hbase.util.MockServer;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.apache.zookeeper.data.Stat;
|
||||
import org.junit.After;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.Lists;
|
||||
|
||||
// TODO: revisit later
|
||||
@Ignore
|
||||
@Category({ MasterTests.class, SmallTests.class })
|
||||
public class TestReplicationHFileCleaner {
|
||||
|
||||
|
@ -87,14 +80,14 @@ public class TestReplicationHFileCleaner {
|
|||
|
||||
@BeforeClass
|
||||
public static void setUpBeforeClass() throws Exception {
|
||||
TEST_UTIL.startMiniZKCluster();
|
||||
TEST_UTIL.startMiniCluster();
|
||||
server = new DummyServer();
|
||||
conf.setBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, true);
|
||||
HMaster.decorateMasterConfiguration(conf);
|
||||
rp =
|
||||
ReplicationFactory.getReplicationPeers(server.getFileSystem(), server.getZooKeeper(), conf);
|
||||
rp.init();
|
||||
rq = ReplicationStorageFactory.getReplicationQueueStorage(server.getZooKeeper(), conf);
|
||||
rq = ReplicationStorageFactory.getReplicationQueueStorage(server.getConnection(), conf);
|
||||
fs = FileSystem.get(conf);
|
||||
}
|
||||
|
||||
|
@ -109,7 +102,6 @@ public class TestReplicationHFileCleaner {
|
|||
rp.getPeerStorage().addPeer(peerId,
|
||||
ReplicationPeerConfig.newBuilder().setClusterKey(TEST_UTIL.getClusterKey()).build(), true,
|
||||
SyncReplicationState.NONE);
|
||||
rq.addPeerToHFileRefs(peerId);
|
||||
}
|
||||
|
||||
@After
|
||||
|
@ -184,47 +176,6 @@ public class TestReplicationHFileCleaner {
|
|||
assertTrue(deletableFilesIterator.next().getPath().equals(deletablefile));
|
||||
}
|
||||
|
||||
/**
|
||||
* ReplicationHFileCleaner should be able to ride over ZooKeeper errors without aborting.
|
||||
*/
|
||||
@Test
|
||||
public void testZooKeeperAbort() throws Exception {
|
||||
ReplicationHFileCleaner cleaner = new ReplicationHFileCleaner();
|
||||
|
||||
List<FileStatus> dummyFiles = Lists.newArrayList(
|
||||
new FileStatus(100, false, 3, 100, EnvironmentEdgeManager.currentTime(), new Path("hfile1")),
|
||||
new FileStatus(100, false, 3, 100, EnvironmentEdgeManager.currentTime(), new Path("hfile2")));
|
||||
|
||||
FaultyZooKeeperWatcher faultyZK =
|
||||
new FaultyZooKeeperWatcher(conf, "testZooKeeperAbort-faulty", null);
|
||||
try {
|
||||
faultyZK.init();
|
||||
cleaner.setConf(conf, faultyZK);
|
||||
// should keep all files due to a ConnectionLossException getting the queues znodes
|
||||
Iterable<FileStatus> toDelete = cleaner.getDeletableFiles(dummyFiles);
|
||||
assertFalse(toDelete.iterator().hasNext());
|
||||
assertFalse(cleaner.isStopped());
|
||||
} finally {
|
||||
faultyZK.close();
|
||||
}
|
||||
|
||||
// when zk is working both files should be returned
|
||||
cleaner = new ReplicationHFileCleaner();
|
||||
ZKWatcher zkw = new ZKWatcher(conf, "testZooKeeperAbort-normal", null);
|
||||
try {
|
||||
cleaner.setConf(conf, zkw);
|
||||
Iterable<FileStatus> filesToDelete = cleaner.getDeletableFiles(dummyFiles);
|
||||
Iterator<FileStatus> iter = filesToDelete.iterator();
|
||||
assertTrue(iter.hasNext());
|
||||
assertEquals(new Path("hfile1"), iter.next().getPath());
|
||||
assertTrue(iter.hasNext());
|
||||
assertEquals(new Path("hfile2"), iter.next().getPath());
|
||||
assertFalse(iter.hasNext());
|
||||
} finally {
|
||||
zkw.close();
|
||||
}
|
||||
}
|
||||
|
||||
static class DummyServer extends MockServer {
|
||||
|
||||
@Override
|
||||
|
@ -233,13 +184,12 @@ public class TestReplicationHFileCleaner {
|
|||
}
|
||||
|
||||
@Override
|
||||
public ZKWatcher getZooKeeper() {
|
||||
public Connection getConnection() {
|
||||
try {
|
||||
return new ZKWatcher(getConfiguration(), "dummy server", this);
|
||||
return TEST_UTIL.getConnection();
|
||||
} catch (IOException e) {
|
||||
LOG.error("Can not get ZKWatcher", e);
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -247,29 +197,8 @@ public class TestReplicationHFileCleaner {
|
|||
try {
|
||||
return TEST_UTIL.getTestFileSystem();
|
||||
} catch (IOException e) {
|
||||
LOG.error("Can not get FileSystem", e);
|
||||
throw new UncheckedIOException(e);
|
||||
}
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
static class FaultyZooKeeperWatcher extends ZKWatcher {
|
||||
private RecoverableZooKeeper zk;
|
||||
|
||||
public FaultyZooKeeperWatcher(Configuration conf, String identifier, Abortable abortable)
|
||||
throws ZooKeeperConnectionException, IOException {
|
||||
super(conf, identifier, abortable);
|
||||
}
|
||||
|
||||
public void init() throws Exception {
|
||||
this.zk = spy(super.getRecoverableZooKeeper());
|
||||
doThrow(new KeeperException.ConnectionLossException()).when(zk)
|
||||
.getData("/hbase/replication/hfile-refs", null, new Stat());
|
||||
}
|
||||
|
||||
@Override
|
||||
public RecoverableZooKeeper getRecoverableZooKeeper() {
|
||||
return zk;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@@ -114,7 +114,7 @@ public class TestBulkLoadReplicationHFileRefs extends TestReplicationBase {
    admin1 = UTIL1.getConnection().getAdmin();
    admin2 = UTIL2.getConnection().getAdmin();

    queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(UTIL1.getZooKeeperWatcher(),
    queueStorage = ReplicationStorageFactory.getReplicationQueueStorage(UTIL1.getConnection(),
      UTIL1.getConfiguration());

    admin1.createNamespace(NamespaceDescriptor.create(REPLICATE_NAMESPACE).build());

@ -22,6 +22,7 @@ import static org.junit.Assert.assertThrows;
|
|||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.Collections;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtil;
|
||||
import org.apache.hadoop.hbase.ReplicationPeerNotFoundException;
|
||||
|
@ -29,9 +30,13 @@ import org.apache.hadoop.hbase.ServerName;
|
|||
import org.apache.hadoop.hbase.SingleProcessHBaseCluster;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||
import org.apache.hadoop.hbase.master.replication.ReplicationPeerManager;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
|
||||
import org.apache.hadoop.hbase.replication.ReplicationUtils;
|
||||
import org.apache.hadoop.hbase.replication.regionserver.ReplicationSourceInterface;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.testclassification.RegionServerTests;
|
||||
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
||||
|
@ -57,6 +62,10 @@ public class TestStartupWithLegacyRegionReplicationEndpoint {
|
|||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
UTIL.startMiniCluster(1);
|
||||
// add a peer to force initialize the replication storage
|
||||
UTIL.getAdmin().addReplicationPeer("1", ReplicationPeerConfig.newBuilder()
|
||||
.setClusterKey(UTIL.getZkCluster().getAddress().toString() + ":/1").build());
|
||||
UTIL.getAdmin().removeReplicationPeer("1");
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
|
@ -66,40 +75,42 @@ public class TestStartupWithLegacyRegionReplicationEndpoint {
|
|||
|
||||
@Test
|
||||
public void test() throws Exception {
|
||||
String peerId = "legacy";
|
||||
ReplicationPeerConfig peerConfig = ReplicationPeerConfig.newBuilder()
|
||||
.setClusterKey("127.0.0.1:2181:/hbase")
|
||||
.setReplicationEndpointImpl(ReplicationUtils.LEGACY_REGION_REPLICATION_ENDPOINT_NAME).build();
|
||||
SingleProcessHBaseCluster cluster = UTIL.getMiniHBaseCluster();
|
||||
HMaster master = cluster.getMaster();
|
||||
// can not use Admin.addPeer as it will fail with ClassNotFound
|
||||
master.getReplicationPeerManager().addPeer("legacy", peerConfig, true);
|
||||
master.getReplicationPeerManager().addPeer(peerId, peerConfig, true);
|
||||
// add a wal file to the queue
|
||||
ServerName rsName = cluster.getRegionServer(0).getServerName();
|
||||
master.getReplicationPeerManager().getQueueStorage().addWAL(rsName,
|
||||
ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER, "test-wal-file");
|
||||
master.getReplicationPeerManager().getQueueStorage().setOffset(
|
||||
new ReplicationQueueId(rsName, ServerRegionReplicaUtil.REGION_REPLICA_REPLICATION_PEER), "",
|
||||
new ReplicationGroupOffset("test-wal-file", 0), Collections.emptyMap());
|
||||
cluster.stopRegionServer(0);
|
||||
RegionServerThread rst = cluster.startRegionServer();
|
||||
// we should still have this peer
|
||||
assertNotNull(UTIL.getAdmin().getReplicationPeerConfig("legacy"));
|
||||
assertNotNull(UTIL.getAdmin().getReplicationPeerConfig(peerId));
|
||||
// but at RS side, we should not have this peer loaded as replication source
|
||||
assertTrue(rst.getRegionServer().getReplicationSourceService().getReplicationManager()
|
||||
.getSources().isEmpty());
|
||||
assertTrue(
|
||||
rst.getRegionServer().getReplicationSourceService().getReplicationManager().getSources()
|
||||
.stream().map(ReplicationSourceInterface::getPeerId).noneMatch(p -> p.equals(peerId)));
|
||||
|
||||
UTIL.shutdownMiniHBaseCluster();
|
||||
UTIL.restartHBaseCluster(1);
|
||||
// now we should have removed the peer
|
||||
assertThrows(ReplicationPeerNotFoundException.class,
|
||||
() -> UTIL.getAdmin().getReplicationPeerConfig("legacy"));
|
||||
// at the RS side, the peer itself should be gone this time, not just its replication source
|
||||
assertTrue(UTIL.getMiniHBaseCluster().getRegionServer(0).getReplicationSourceService()
|
||||
.getReplicationManager().getReplicationPeers().getAllPeerIds().isEmpty());
|
||||
|
||||
// make sure that we can finish the SCP and delete the test-wal-file
|
||||
// make sure that we can finish the SCP
|
||||
UTIL.waitFor(15000,
|
||||
() -> UTIL.getMiniHBaseCluster().getMaster().getProcedures().stream()
|
||||
.filter(p -> p instanceof ServerCrashProcedure).map(p -> (ServerCrashProcedure) p)
|
||||
.allMatch(Procedure::isSuccess));
|
||||
assertTrue(UTIL.getMiniHBaseCluster().getMaster().getReplicationPeerManager().getQueueStorage()
|
||||
.getAllQueues(rsName).isEmpty());
|
||||
// the deletion is async, so wait until they get deleted
|
||||
ReplicationPeerManager ppm = UTIL.getMiniHBaseCluster().getMaster().getReplicationPeerManager();
|
||||
UTIL.waitFor(15000, () -> !ppm.getPeerStorage().listPeerIds().contains(peerId)
|
||||
&& ppm.getQueueStorage().listAllQueueIds(peerId, rsName).isEmpty());
|
||||
}
|
||||
}
|
||||
|
|
|
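For reference, the setOffset call in the test above replaces the old addWAL bookkeeping: a queue is addressed by a ReplicationQueueId, and the position inside a WAL group is a ReplicationGroupOffset. A small sketch of that call, with illustrative names and values:

// Sketch (illustrative): record that replication for a peer should start at the
// beginning of a given WAL, using the new offset-based API exercised above.
import java.util.Collections;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;

public final class OffsetExample { // hypothetical class name
  private OffsetExample() {
  }

  public static void recordStartOfWal(ReplicationQueueStorage queueStorage, ServerName rsName,
    String peerId, String walGroup, String walName) throws ReplicationException {
    ReplicationQueueId queueId = new ReplicationQueueId(rsName, peerId);
    // offset 0 means "start of this WAL"; no last-pushed sequence ids are recorded here
    queueStorage.setOffset(queueId, walGroup, new ReplicationGroupOffset(walName, 0),
      Collections.emptyMap());
  }
}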
@@ -41,7 +41,7 @@ public class ReplicationSourceDummy implements ReplicationSourceInterface {

  private ReplicationSourceManager manager;
  private ReplicationPeer replicationPeer;
  private String peerClusterId;
  private ReplicationQueueId queueId;
  private Path currentPath;
  private MetricsSource metrics;
  private WALFileLengthProvider walFileLengthProvider;

@@ -49,11 +49,11 @@ public class ReplicationSourceDummy implements ReplicationSourceInterface {

  @Override
  public void init(Configuration conf, FileSystem fs, ReplicationSourceManager manager,
    ReplicationQueueStorage rq, ReplicationPeer rp, Server server, String peerClusterId,
    ReplicationQueueStorage rq, ReplicationPeer rp, Server server, ReplicationQueueData queueData,
    UUID clusterId, WALFileLengthProvider walFileLengthProvider, MetricsSource metrics)
    throws IOException {
    this.manager = manager;
    this.peerClusterId = peerClusterId;
    this.queueId = queueData.getId();
    this.metrics = metrics;
    this.walFileLengthProvider = walFileLengthProvider;
    this.replicationPeer = rp;

@@ -100,14 +100,13 @@ public class ReplicationSourceDummy implements ReplicationSourceInterface {
  }

  @Override
  public String getQueueId() {
    return peerClusterId;
  public ReplicationQueueId getQueueId() {
    return queueId;
  }

  @Override
  public String getPeerId() {
    String[] parts = peerClusterId.split("-", 2);
    return parts.length != 1 ? parts[0] : peerClusterId;
    return queueId.getPeerId();
  }

  @Override

@@ -79,8 +79,9 @@ public class TestAddToSerialReplicationPeer extends SerialReplicationTestBase {
      ReplicationSourceManager manager =
        ((Replication) rs.getReplicationSourceService()).getReplicationManager();
      // Make sure replication moves to the new file.
      return (manager.getWALs().get(PEER_ID).get(logPrefix).size() == 1)
        && !oldWalName.equals(manager.getWALs().get(PEER_ID).get(logPrefix).first());
      ReplicationQueueId queueId = new ReplicationQueueId(rs.getServerName(), PEER_ID);
      return (manager.getWALs().get(queueId).get(logPrefix).size() == 1)
        && !oldWalName.equals(manager.getWALs().get(queueId).get(logPrefix).first());
    }

    @Override

@ -31,7 +31,7 @@ import org.apache.hadoop.hbase.master.MasterServices;
|
|||
import org.apache.hadoop.hbase.master.RegionServerList;
|
||||
import org.apache.hadoop.hbase.master.ServerManager;
|
||||
import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure;
|
||||
import org.apache.hadoop.hbase.master.replication.ClaimReplicationQueuesProcedure;
|
||||
import org.apache.hadoop.hbase.master.replication.AssignReplicationQueuesProcedure;
|
||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.testclassification.ReplicationTests;
|
||||
|
@ -47,7 +47,7 @@ import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.Procedu
|
|||
|
||||
/**
|
||||
* In HBASE-26029, we reimplement the claim queue operation with proc-v2 and make it a step in SCP,
|
||||
* this is a UT to make sure the {@link ClaimReplicationQueuesProcedure} works correctly.
|
||||
* this is a UT to make sure the {@link AssignReplicationQueuesProcedure} works correctly.
|
||||
*/
|
||||
@Category({ ReplicationTests.class, LargeTests.class })
|
||||
public class TestClaimReplicationQueue extends TestReplicationBase {
|
||||
|
@ -77,7 +77,7 @@ public class TestClaimReplicationQueue extends TestReplicationBase {
|
|||
// return no region server to make the procedure hang
|
||||
if (EMPTY) {
|
||||
for (StackTraceElement e : Thread.currentThread().getStackTrace()) {
|
||||
if (e.getClassName().equals(ClaimReplicationQueuesProcedure.class.getName())) {
|
||||
if (e.getClassName().equals(AssignReplicationQueuesProcedure.class.getName())) {
|
||||
return Collections.emptyList();
|
||||
}
|
||||
}
|
||||
|
@ -149,7 +149,7 @@ public class TestClaimReplicationQueue extends TestReplicationBase {
|
|||
HMaster master = UTIL1.getMiniHBaseCluster().getMaster();
|
||||
UTIL1.waitFor(30000,
|
||||
() -> master.getProcedures().stream()
|
||||
.filter(p -> p instanceof ClaimReplicationQueuesProcedure)
|
||||
.filter(p -> p instanceof AssignReplicationQueuesProcedure)
|
||||
.anyMatch(p -> p.getState() == ProcedureState.WAITING_TIMEOUT));
|
||||
|
||||
hbaseAdmin.enableReplicationPeer(PEER_ID2);
|
||||
|
|
|
@@ -259,8 +259,8 @@ public final class TestNamespaceReplicationWithBulkLoadedData extends TestBulkLo
    MiniZooKeeperCluster zkCluster = UTIL1.getZkCluster();
    ZKWatcher watcher = new ZKWatcher(UTIL1.getConfiguration(), "TestZnodeHFiles-refs", null);
    RecoverableZooKeeper zk = RecoverableZooKeeper.connect(UTIL1.getConfiguration(), watcher);
    ZKReplicationQueueStorage replicationQueueStorage =
      new ZKReplicationQueueStorage(watcher, UTIL1.getConfiguration());
    ReplicationQueueStorage replicationQueueStorage = ReplicationStorageFactory
      .getReplicationQueueStorage(UTIL1.getConnection(), UTIL1.getConfiguration());
    Set<String> hfiles = replicationQueueStorage.getAllHFileRefs();
    assertTrue(hfiles.isEmpty());
  }

@@ -332,7 +332,6 @@ public class TestReplicationEmptyWALRecovery extends TestReplicationBase {
    for (int i = 0; i < numRs; i++) {
      HRegionServer hrs = UTIL1.getHBaseCluster().getRegionServer(i);
      Replication replicationService = (Replication) hrs.getReplicationSourceService();
      replicationService.getReplicationManager().preLogRoll(emptyWalPaths.get(i));
      replicationService.getReplicationManager().postLogRoll(emptyWalPaths.get(i));
      RegionInfo regionInfo =
        UTIL1.getHBaseCluster().getRegions(htable1.getName()).get(0).getRegionInfo();

@ -32,11 +32,14 @@ import org.apache.hadoop.hbase.testclassification.LargeTests;
|
|||
import org.apache.hadoop.hbase.testclassification.ReplicationTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
// revisit later when we implement the new ReplicationSyncUpTool
|
||||
@Ignore
|
||||
@Category({ ReplicationTests.class, LargeTests.class })
|
||||
public class TestReplicationSyncUpTool extends TestReplicationSyncUpToolBase {
|
||||
|
||||
|
|
|
@ -45,11 +45,14 @@ import org.apache.hadoop.hbase.tool.BulkLoadHFiles;
|
|||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.HFileTestUtil;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
//revisit later when we implement the new ReplicationSyncUpTool
|
||||
@Ignore
|
||||
@Category({ ReplicationTests.class, LargeTests.class })
|
||||
public class TestReplicationSyncUpToolWithBulkLoadedData extends TestReplicationSyncUpToolBase {
|
||||
|
||||
|
|
|
@ -32,9 +32,12 @@ import org.apache.hadoop.hbase.util.CommonFSUtils.StreamLacksCapabilityException
|
|||
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
||||
import org.junit.Before;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
// revisit later when we reviewing the implementation for serial replication
|
||||
@Ignore
|
||||
@Category({ ReplicationTests.class, MediumTests.class })
|
||||
public class TestSerialReplicationFailover extends SerialReplicationTestBase {
|
||||
|
||||
|
|
|
@ -0,0 +1,423 @@
|
|||
/*
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.replication;
|
||||
|
||||
import static org.hamcrest.MatcherAssert.assertThat;
|
||||
import static org.hamcrest.Matchers.hasItem;
|
||||
import static org.hamcrest.Matchers.hasSize;
|
||||
import static org.hamcrest.Matchers.not;
|
||||
import static org.junit.Assert.assertEquals;
|
||||
import static org.junit.Assert.assertFalse;
|
||||
import static org.junit.Assert.assertTrue;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.concurrent.ConcurrentHashMap;
|
||||
import java.util.stream.Collectors;
|
||||
import java.util.stream.IntStream;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.HBaseClassTestRule;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtil;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.TableNameTestRule;
|
||||
import org.apache.hadoop.hbase.client.TableDescriptor;
|
||||
import org.apache.hadoop.hbase.testclassification.MediumTests;
|
||||
import org.apache.hadoop.hbase.testclassification.ReplicationTests;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.MD5Hash;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
import org.hamcrest.Matchers;
|
||||
import org.hamcrest.collection.IsEmptyCollection;
|
||||
import org.junit.AfterClass;
|
||||
import org.junit.Before;
|
||||
import org.junit.BeforeClass;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Rule;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap;
|
||||
import org.apache.hbase.thirdparty.com.google.common.collect.Iterables;
|
||||
|
||||
@Category({ ReplicationTests.class, MediumTests.class })
|
||||
public class TestTableReplicationQueueStorage {
|
||||
|
||||
@ClassRule
|
||||
public static final HBaseClassTestRule CLASS_RULE =
|
||||
HBaseClassTestRule.forClass(TestTableReplicationQueueStorage.class);
|
||||
|
||||
private static final Logger LOG = LoggerFactory.getLogger(TestTableReplicationQueueStorage.class);
|
||||
|
||||
private static final HBaseTestingUtil UTIL = new HBaseTestingUtil();
|
||||
|
||||
@Rule
|
||||
public TableNameTestRule tableNameRule = new TableNameTestRule();
|
||||
|
||||
private TableReplicationQueueStorage storage;
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
UTIL.startMiniCluster();
|
||||
}
|
||||
|
||||
@AfterClass
|
||||
public static void tearDown() throws IOException {
|
||||
UTIL.shutdownMiniCluster();
|
||||
}
|
||||
|
||||
@Before
|
||||
public void setUpBeforeTest() throws Exception {
|
||||
TableName tableName = tableNameRule.getTableName();
|
||||
TableDescriptor td = ReplicationStorageFactory.createReplicationQueueTableDescriptor(tableName);
|
||||
UTIL.getAdmin().createTable(td);
|
||||
UTIL.waitTableAvailable(tableName);
|
||||
storage = new TableReplicationQueueStorage(UTIL.getConnection(), tableName);
|
||||
}
|
||||
|
||||
private ServerName getServerName(int i) {
|
||||
return ServerName.valueOf("127.0.0.1", 8000 + i, 10000 + i);
|
||||
}
|
||||
|
||||
private String getFileName(String base, int i) {
|
||||
return String.format(base + "-%04d", i);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testReplicator() throws ReplicationException {
|
||||
assertTrue(storage.listAllReplicators().isEmpty());
|
||||
String peerId = "1";
|
||||
for (int i = 0; i < 10; i++) {
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(getServerName(i), peerId);
|
||||
storage.setOffset(queueId, "group-" + i, new ReplicationGroupOffset("file-" + i, i * 100),
|
||||
Collections.emptyMap());
|
||||
}
|
||||
List<ServerName> replicators = storage.listAllReplicators();
|
||||
assertEquals(10, replicators.size());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertThat(replicators, hasItem(getServerName(i)));
|
||||
}
|
||||
for (int i = 0; i < 5; i++) {
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(getServerName(i), peerId);
|
||||
storage.removeQueue(queueId);
|
||||
}
|
||||
replicators = storage.listAllReplicators();
|
||||
assertEquals(5, replicators.size());
|
||||
for (int i = 0; i < 5; i++) {
|
||||
assertThat(replicators, not(hasItem(getServerName(i))));
|
||||
}
|
||||
for (int i = 5; i < 10; i++) {
|
||||
assertThat(replicators, hasItem(getServerName(i)));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testGetSetOffset() {
|
||||
|
||||
}
|
||||
|
||||
private void assertQueueId(String peerId, ServerName serverName, ReplicationQueueId queueId) {
|
||||
assertEquals(peerId, queueId.getPeerId());
|
||||
assertEquals(serverName, queueId.getServerName());
|
||||
assertFalse(queueId.getSourceServerName().isPresent());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testPersistLogPositionAndSeqIdAtomically() throws Exception {
|
||||
ServerName serverName1 = ServerName.valueOf("127.0.0.1", 8000, 10000);
|
||||
assertTrue(storage.listAllQueueIds(serverName1).isEmpty());
|
||||
String peerId1 = "1";
|
||||
String region0 = "6b2c8f8555335cc9af74455b94516cbe";
|
||||
String region1 = "6ecd2e9e010499f8ddef97ee8f70834f";
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(serverName1, peerId1);
|
||||
assertTrue(storage.getOffsets(queueId).isEmpty());
|
||||
}
|
||||
assertEquals(HConstants.NO_SEQNUM, storage.getLastSequenceId(region0, peerId1));
|
||||
assertEquals(HConstants.NO_SEQNUM, storage.getLastSequenceId(region1, peerId1));
|
||||
|
||||
for (int i = 0; i < 10; i++) {
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(serverName1, peerId1);
|
||||
storage.setOffset(queueId, "group1-" + i,
|
||||
new ReplicationGroupOffset(getFileName("file1", i), (i + 1) * 100),
|
||||
ImmutableMap.of(region0, i * 100L, region1, (i + 1) * 100L));
|
||||
}
|
||||
|
||||
List<ReplicationQueueId> queueIds = storage.listAllQueueIds(serverName1);
|
||||
assertEquals(1, queueIds.size());
|
||||
assertQueueId(peerId1, serverName1, queueIds.get(0));
|
||||
|
||||
Map<String, ReplicationGroupOffset> offsets =
|
||||
storage.getOffsets(new ReplicationQueueId(serverName1, peerId1));
|
||||
for (int i = 0; i < 10; i++) {
|
||||
ReplicationGroupOffset offset = offsets.get("group1-" + i);
|
||||
assertEquals(getFileName("file1", i), offset.getWal());
|
||||
assertEquals((i + 1) * 100, offset.getOffset());
|
||||
}
|
||||
assertEquals(900L, storage.getLastSequenceId(region0, peerId1));
|
||||
assertEquals(1000L, storage.getLastSequenceId(region1, peerId1));
|
||||
|
||||
// Try to decrease the last pushed id by setWALPosition method.
|
||||
storage.setOffset(new ReplicationQueueId(serverName1, peerId1), "group1-0",
|
||||
new ReplicationGroupOffset(getFileName("file1", 0), 11 * 100),
|
||||
ImmutableMap.of(region0, 899L, region1, 1001L));
|
||||
assertEquals(900L, storage.getLastSequenceId(region0, peerId1));
|
||||
assertEquals(1001L, storage.getLastSequenceId(region1, peerId1));
|
||||
}
|
||||
|
||||
private void assertGroupOffset(String wal, long offset, ReplicationGroupOffset groupOffset) {
|
||||
assertEquals(wal, groupOffset.getWal());
|
||||
assertEquals(offset, groupOffset.getOffset());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testClaimQueue() throws Exception {
|
||||
String peerId = "1";
|
||||
ServerName serverName1 = getServerName(1);
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(serverName1, peerId);
|
||||
for (int i = 0; i < 10; i++) {
|
||||
storage.setOffset(queueId, "group-" + i, new ReplicationGroupOffset("wal-" + i, i),
|
||||
Collections.emptyMap());
|
||||
}
|
||||
|
||||
ServerName serverName2 = getServerName(2);
|
||||
Map<String, ReplicationGroupOffset> offsets2 = storage.claimQueue(queueId, serverName2);
|
||||
assertEquals(10, offsets2.size());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertGroupOffset("wal-" + i, i, offsets2.get("group-" + i));
|
||||
}
|
||||
ReplicationQueueId claimedQueueId2 = new ReplicationQueueId(serverName2, peerId, serverName1);
|
||||
assertThat(storage.listAllQueueIds(peerId, serverName1), IsEmptyCollection.empty());
|
||||
assertThat(storage.listAllQueueIds(peerId, serverName2),
|
||||
Matchers.<List<ReplicationQueueId>> both(hasItem(claimedQueueId2)).and(hasSize(1)));
|
||||
offsets2 = storage.getOffsets(claimedQueueId2);
|
||||
assertEquals(10, offsets2.size());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertGroupOffset("wal-" + i, i, offsets2.get("group-" + i));
|
||||
}
|
||||
|
||||
ServerName serverName3 = getServerName(3);
|
||||
Map<String, ReplicationGroupOffset> offsets3 = storage.claimQueue(claimedQueueId2, serverName3);
|
||||
assertEquals(10, offsets3.size());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertGroupOffset("wal-" + i, i, offsets3.get("group-" + i));
|
||||
}
|
||||
ReplicationQueueId claimedQueueId3 = new ReplicationQueueId(serverName3, peerId, serverName1);
|
||||
assertThat(storage.listAllQueueIds(peerId, serverName1), IsEmptyCollection.empty());
|
||||
assertThat(storage.listAllQueueIds(peerId, serverName2), IsEmptyCollection.empty());
|
||||
assertThat(storage.listAllQueueIds(peerId, serverName3),
|
||||
Matchers.<List<ReplicationQueueId>> both(hasItem(claimedQueueId3)).and(hasSize(1)));
|
||||
offsets3 = storage.getOffsets(claimedQueueId3);
|
||||
assertEquals(10, offsets3.size());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
assertGroupOffset("wal-" + i, i, offsets3.get("group-" + i));
|
||||
}
|
||||
storage.removeQueue(claimedQueueId3);
|
||||
assertThat(storage.listAllQueueIds(peerId), IsEmptyCollection.empty());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testClaimQueueMultiThread() throws Exception {
|
||||
String peerId = "3";
|
||||
String walGroup = "group";
|
||||
ReplicationGroupOffset groupOffset = new ReplicationGroupOffset("wal", 123);
|
||||
ServerName sourceServerName = getServerName(100);
|
||||
ReplicationQueueId queueId = new ReplicationQueueId(sourceServerName, peerId);
|
||||
storage.setOffset(queueId, walGroup, groupOffset, Collections.emptyMap());
|
||||
List<ServerName> serverNames =
|
||||
IntStream.range(0, 10).mapToObj(this::getServerName).collect(Collectors.toList());
|
||||
for (int i = 0; i < 10; i++) {
|
||||
final ReplicationQueueId toClaim = queueId;
|
||||
List<Thread> threads = new ArrayList<>();
|
||||
Map<ServerName, Map<String, ReplicationGroupOffset>> claimed = new ConcurrentHashMap<>();
|
||||
Set<ServerName> failed = ConcurrentHashMap.newKeySet();
|
||||
for (ServerName serverName : serverNames) {
|
||||
if (serverName.equals(queueId.getServerName())) {
|
||||
continue;
|
||||
}
|
||||
threads.add(new Thread("Claim-" + i + "-" + serverName) {
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
try {
|
||||
Map<String, ReplicationGroupOffset> offsets = storage.claimQueue(toClaim, serverName);
|
||||
if (!offsets.isEmpty()) {
|
||||
claimed.put(serverName, offsets);
|
||||
}
|
||||
} catch (ReplicationException e) {
|
||||
LOG.error("failed to claim queue", e);
|
||||
failed.add(serverName);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
LOG.info("Claim round {}, there are {} threads to claim {}", i, threads.size(), toClaim);
|
||||
for (Thread thread : threads) {
|
||||
thread.start();
|
||||
}
|
||||
for (Thread thread : threads) {
|
||||
thread.join(30000);
|
||||
assertFalse(thread.isAlive());
|
||||
}
|
||||
LOG.info("Finish claim round {}, claimed={}, failed={}", i, claimed, failed);
|
||||
assertThat(failed, IsEmptyCollection.empty());
|
||||
assertEquals(1, claimed.size());
|
||||
Map<String, ReplicationGroupOffset> offsets = Iterables.getOnlyElement(claimed.values());
|
||||
assertEquals(1, offsets.size());
|
||||
assertGroupOffset("wal", 123, offsets.get("group"));
|
||||
queueId = new ReplicationQueueId(Iterables.getOnlyElement(claimed.keySet()), peerId,
|
||||
sourceServerName);
|
||||
assertThat(storage.listAllQueueIds(peerId),
|
||||
Matchers.<List<ReplicationQueueId>> both(hasItem(queueId)).and(hasSize(1)));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testListRemovePeerAllQueues() throws Exception {
|
||||
String peerId1 = "1";
|
||||
String peerId2 = "2";
|
||||
for (int i = 0; i < 100; i++) {
|
||||
ServerName serverName = getServerName(i);
|
||||
String group = "group";
|
||||
ReplicationGroupOffset offset = new ReplicationGroupOffset("wal", i);
|
||||
ReplicationQueueId queueId1 = new ReplicationQueueId(serverName, peerId1);
|
||||
ReplicationQueueId queueId2 = new ReplicationQueueId(serverName, peerId2);
|
||||
storage.setOffset(queueId1, group, offset, Collections.emptyMap());
|
||||
storage.setOffset(queueId2, group, offset, Collections.emptyMap());
|
||||
}
|
||||
List<ReplicationQueueData> queueDatas = storage.listAllQueues();
|
||||
assertThat(queueDatas, hasSize(200));
|
||||
for (int i = 0; i < 100; i++) {
|
||||
ReplicationQueueData peerId1Data = queueDatas.get(i);
|
||||
ReplicationQueueData peerId2Data = queueDatas.get(i + 100);
|
||||
ServerName serverName = getServerName(i);
|
||||
assertEquals(new ReplicationQueueId(serverName, peerId1), peerId1Data.getId());
|
||||
assertEquals(new ReplicationQueueId(serverName, peerId2), peerId2Data.getId());
|
||||
assertEquals(1, peerId1Data.getOffsets().size());
|
||||
assertEquals(1, peerId2Data.getOffsets().size());
|
||||
assertGroupOffset("wal", i, peerId1Data.getOffsets().get("group"));
|
||||
assertGroupOffset("wal", i, peerId2Data.getOffsets().get("group"));
|
||||
}
|
||||
List<ReplicationQueueId> queueIds1 = storage.listAllQueueIds(peerId1);
|
||||
assertThat(queueIds1, hasSize(100));
|
||||
for (int i = 0; i < 100; i++) {
|
||||
ServerName serverName = getServerName(i);
|
||||
assertEquals(new ReplicationQueueId(serverName, peerId1), queueIds1.get(i));
|
||||
}
|
||||
List<ReplicationQueueId> queueIds2 = storage.listAllQueueIds(peerId2);
|
||||
assertThat(queueIds2, hasSize(100));
|
||||
for (int i = 0; i < 100; i++) {
|
||||
ServerName serverName = getServerName(i);
|
||||
assertEquals(new ReplicationQueueId(serverName, peerId2), queueIds2.get(i));
|
||||
}
|
||||
|
||||
storage.removeAllQueues(peerId1);
|
||||
assertThat(storage.listAllQueues(), hasSize(100));
|
||||
assertThat(storage.listAllQueueIds(peerId1), IsEmptyCollection.empty());
|
||||
assertThat(storage.listAllQueueIds(peerId2), hasSize(100));
|
||||
|
||||
storage.removeAllQueues(peerId2);
|
||||
assertThat(storage.listAllQueues(), IsEmptyCollection.empty());
|
||||
assertThat(storage.listAllQueueIds(peerId1), IsEmptyCollection.empty());
|
||||
assertThat(storage.listAllQueueIds(peerId2), IsEmptyCollection.empty());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemoveAllLastPushedSeqIdsForPeer() throws Exception {
|
||||
String peerId = "1";
|
||||
String peerIdToDelete = "2";
|
||||
for (int i = 0; i < 100; i++) {
|
||||
String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i));
|
||||
storage.setLastSequenceIds(peerId, ImmutableMap.of(encodedRegionName, (long) i));
|
||||
storage.setLastSequenceIds(peerIdToDelete, ImmutableMap.of(encodedRegionName, (long) i));
|
||||
}
|
||||
for (int i = 0; i < 100; i++) {
|
||||
String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i));
|
||||
assertEquals(i, storage.getLastSequenceId(encodedRegionName, peerId));
|
||||
assertEquals(i, storage.getLastSequenceId(encodedRegionName, peerIdToDelete));
|
||||
}
|
||||
storage.removeLastSequenceIds(peerIdToDelete);
|
||||
for (int i = 0; i < 100; i++) {
|
||||
String encodedRegionName = MD5Hash.getMD5AsHex(Bytes.toBytes(i));
|
||||
assertEquals(i, storage.getLastSequenceId(encodedRegionName, peerId));
|
||||
assertEquals(HConstants.NO_SEQNUM,
|
||||
storage.getLastSequenceId(encodedRegionName, peerIdToDelete));
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testHfileRefsReplicationQueues() throws ReplicationException, KeeperException {
|
||||
String peerId1 = "1";
|
||||
|
||||
List<Pair<Path, Path>> files1 = new ArrayList<>(3);
|
||||
files1.add(new Pair<>(null, new Path("file_1")));
|
||||
files1.add(new Pair<>(null, new Path("file_2")));
|
||||
files1.add(new Pair<>(null, new Path("file_3")));
|
||||
assertTrue(storage.getReplicableHFiles(peerId1).isEmpty());
|
||||
assertEquals(0, storage.getAllPeersFromHFileRefsQueue().size());
|
||||
|
||||
storage.addHFileRefs(peerId1, files1);
|
||||
assertEquals(1, storage.getAllPeersFromHFileRefsQueue().size());
|
||||
assertEquals(3, storage.getReplicableHFiles(peerId1).size());
|
||||
List<String> hfiles2 = new ArrayList<>(files1.size());
|
||||
for (Pair<Path, Path> p : files1) {
|
||||
hfiles2.add(p.getSecond().getName());
|
||||
}
|
||||
String removedString = hfiles2.remove(0);
|
||||
storage.removeHFileRefs(peerId1, hfiles2);
|
||||
assertEquals(1, storage.getReplicableHFiles(peerId1).size());
|
||||
hfiles2 = new ArrayList<>(1);
|
||||
hfiles2.add(removedString);
|
||||
storage.removeHFileRefs(peerId1, hfiles2);
|
||||
assertEquals(0, storage.getReplicableHFiles(peerId1).size());
|
||||
}
|
||||
|
||||
@Test
|
||||
public void testRemovePeerForHFileRefs() throws ReplicationException, KeeperException {
|
||||
String peerId1 = "1";
|
||||
String peerId2 = "2";
|
||||
|
||||
List<Pair<Path, Path>> files1 = new ArrayList<>(3);
|
||||
files1.add(new Pair<>(null, new Path("file_1")));
|
||||
files1.add(new Pair<>(null, new Path("file_2")));
|
||||
files1.add(new Pair<>(null, new Path("file_3")));
|
||||
storage.addHFileRefs(peerId1, files1);
|
||||
storage.addHFileRefs(peerId2, files1);
|
||||
assertEquals(2, storage.getAllPeersFromHFileRefsQueue().size());
|
||||
assertEquals(3, storage.getReplicableHFiles(peerId1).size());
|
||||
assertEquals(3, storage.getReplicableHFiles(peerId2).size());
|
||||
|
||||
storage.removePeerFromHFileRefs(peerId1);
|
||||
assertEquals(1, storage.getAllPeersFromHFileRefsQueue().size());
|
||||
assertTrue(storage.getReplicableHFiles(peerId1).isEmpty());
|
||||
assertEquals(3, storage.getReplicableHFiles(peerId2).size());
|
||||
|
||||
storage.removePeerFromHFileRefs(peerId2);
|
||||
assertEquals(0, storage.getAllPeersFromHFileRefsQueue().size());
|
||||
assertTrue(storage.getReplicableHFiles(peerId2).isEmpty());
|
||||
}
|
||||
}
|
|
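The claim tests above illustrate the intended failover flow: a live server claims a dead server's queue, receives its per-group offsets, and the queue id it now owns records the original server. A condensed sketch of that flow, under the same assumptions the tests exercise; the helper name is illustrative.

// Sketch summarizing the claim flow exercised by the tests above.
import java.util.Map;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;

public final class ClaimQueueExample { // hypothetical helper
  private ClaimQueueExample() {
  }

  // Move a dead server's queue to a live server and return the id it now lives under.
  public static ReplicationQueueId claim(ReplicationQueueStorage storage, ServerName deadServer,
    ServerName liveServer, String peerId) throws ReplicationException {
    ReplicationQueueId oldId = new ReplicationQueueId(deadServer, peerId);
    // claimQueue re-homes the per-group offsets onto liveServer and returns them;
    // an empty map means another server already claimed this queue (see the
    // multi-threaded claim test above)
    Map<String, ReplicationGroupOffset> offsets = storage.claimQueue(oldId, liveServer);
    if (offsets.isEmpty()) {
      return null;
    }
    // after a claim, the queue id carries the original (source) server name
    return new ReplicationQueueId(liveServer, peerId, deadServer);
  }
}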
@ -25,8 +25,11 @@ import org.apache.hadoop.hbase.testclassification.ReplicationTests;
|
|||
import org.apache.hadoop.hbase.wal.RegionGroupingProvider;
|
||||
import org.apache.hadoop.hbase.wal.WALFactory;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
//revisit later when we implement the new ReplicationSyncUpTool
|
||||
@Ignore
|
||||
@Category({ ReplicationTests.class, LargeTests.class })
|
||||
public class TestReplicationSyncUpToolWithMultipleAsyncWAL extends TestReplicationSyncUpTool {
|
||||
|
||||
|
|
|
@ -25,8 +25,11 @@ import org.apache.hadoop.hbase.testclassification.ReplicationTests;
|
|||
import org.apache.hadoop.hbase.wal.RegionGroupingProvider;
|
||||
import org.apache.hadoop.hbase.wal.WALFactory;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
//revisit later when we implement the new ReplicationSyncUpTool
|
||||
@Ignore
|
||||
@Category({ ReplicationTests.class, LargeTests.class })
|
||||
public class TestReplicationSyncUpToolWithMultipleWAL extends TestReplicationSyncUpTool {
|
||||
|
||||
|
|
|
@ -35,9 +35,12 @@ import org.apache.hadoop.hbase.testclassification.ReplicationTests;
|
|||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.wal.AbstractFSWALProvider;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
// TODO: revisit later
|
||||
@Ignore
|
||||
@Category({ ReplicationTests.class, MediumTests.class })
|
||||
public class TestDrainReplicationQueuesForStandBy extends SyncReplicationTestBase {
|
||||
|
||||
|
|
|
@ -36,12 +36,15 @@ import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
|
|||
import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
|
||||
import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
|
||||
import org.junit.ClassRule;
|
||||
import org.junit.Ignore;
|
||||
import org.junit.Test;
|
||||
import org.junit.experimental.categories.Category;
|
||||
|
||||
/**
|
||||
* Tests for DumpReplicationQueues tool
|
||||
*/
|
||||
// TODO: reimplement
|
||||
@Ignore
|
||||
@Category({ ReplicationTests.class, SmallTests.class })
|
||||
public class TestDumpReplicationQueues {
|
||||
|
||||
|
|
|
@ -23,7 +23,6 @@ import static org.junit.Assert.assertFalse;
|
|||
import static org.junit.Assert.assertNotNull;
|
||||
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;
import static org.mockito.Mockito.doNothing;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;

@@ -57,7 +56,8 @@ import org.apache.hadoop.hbase.regionserver.RegionServerServices;
import org.apache.hadoop.hbase.replication.ReplicationEndpoint;
import org.apache.hadoop.hbase.replication.ReplicationPeer;
import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
import org.apache.hadoop.hbase.replication.ReplicationQueueData;
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
import org.apache.hadoop.hbase.replication.WALEntryFilter;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.ReplicationTests;

@@ -79,6 +79,8 @@ import org.mockito.Mockito;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.common.collect.ImmutableMap;

@Category({ ReplicationTests.class, MediumTests.class })
public class TestReplicationSource {

@@ -134,11 +136,13 @@ public class TestReplicationSource {
ReplicationSourceManager manager = Mockito.mock(ReplicationSourceManager.class);
Mockito.when(manager.getGlobalMetrics())
.thenReturn(mock(MetricsReplicationGlobalSourceSource.class));
String queueId = "qid";
RegionServerServices rss =
TEST_UTIL.createMockRegionServerService(ServerName.parseServerName("a.b.c,1,1"));
rs.init(conf, null, manager, null, mockPeer, rss, queueId, null, p -> OptionalLong.empty(),
new MetricsSource(queueId));
ReplicationQueueId queueId = new ReplicationQueueId(rss.getServerName(), "qid");
rs.init(conf, null, manager, null, mockPeer, rss,
new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(),
new MetricsSource(queueId.toString()));
try {
rs.startup();
assertTrue(rs.isSourceActive());

@@ -171,11 +175,12 @@ public class TestReplicationSource {
.thenReturn(DoNothingReplicationEndpoint.class.getName());
Mockito.when(mockPeer.getPeerConfig()).thenReturn(peerConfig);
ReplicationSourceManager manager = Mockito.mock(ReplicationSourceManager.class);
String queueId = "qid";
RegionServerServices rss =
TEST_UTIL.createMockRegionServerService(ServerName.parseServerName("a.b.c,1,1"));
rs.init(conf, null, manager, null, mockPeer, rss, queueId, uuid, p -> OptionalLong.empty(),
new MetricsSource(queueId));
ReplicationQueueId queueId = new ReplicationQueueId(rss.getServerName(), "qid");
rs.init(conf, null, manager, null, mockPeer, rss,
new ReplicationQueueData(queueId, ImmutableMap.of()), uuid, p -> OptionalLong.empty(),
new MetricsSource(queueId.toString()));
try {
rs.startup();
TEST_UTIL.waitFor(30000, () -> rs.getWalEntryFilter() != null);

@@ -257,8 +262,11 @@ public class TestReplicationSource {
Configuration testConf = HBaseConfiguration.create();
testConf.setInt("replication.source.maxretriesmultiplier", 1);
ReplicationSourceManager manager = Mockito.mock(ReplicationSourceManager.class);
source.init(testConf, null, manager, null, mockPeer, null, "testPeer", null,
p -> OptionalLong.empty(), null);
ReplicationQueueId queueId =
new ReplicationQueueId(ServerName.valueOf("test,123,123"), "testPeer");
source.init(testConf, null, manager, null, mockPeer, null,
new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(),
null);
ExecutorService executor = Executors.newSingleThreadExecutor();
Future<?> future = executor.submit(() -> source.terminate("testing source termination"));
long sleepForRetries = testConf.getLong("replication.source.sleepforretries", 1000);

@@ -276,8 +284,11 @@ public class TestReplicationSource {
ReplicationPeer mockPeer = mock(ReplicationPeer.class);
Mockito.when(mockPeer.getPeerBandwidth()).thenReturn(0L);
Configuration testConf = HBaseConfiguration.create();
source.init(testConf, null, mockManager, null, mockPeer, Mockito.mock(Server.class), "testPeer",
null, p -> OptionalLong.empty(), mock(MetricsSource.class));
ReplicationQueueId queueId =
new ReplicationQueueId(ServerName.valueOf("test,123,123"), "testPeer");
source.init(testConf, null, mockManager, null, mockPeer, Mockito.mock(Server.class),
new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(),
mock(MetricsSource.class));
ReplicationSourceWALReader reader =
new ReplicationSourceWALReader(null, conf, null, 0, null, source, null);
ReplicationSourceShipper shipper = new ReplicationSourceShipper(conf, null, null, source);

@@ -480,35 +491,6 @@ public class TestReplicationSource {

}

/**
* Test HBASE-20497 Moved here from TestReplicationSource because doesn't need cluster.
*/
@Test
public void testRecoveredReplicationSourceShipperGetPosition() throws Exception {
String walGroupId = "fake-wal-group-id";
ServerName serverName = ServerName.valueOf("www.example.com", 12006, 1524679704418L);
ServerName deadServer = ServerName.valueOf("www.deadServer.com", 12006, 1524679704419L);
RecoveredReplicationSource source = mock(RecoveredReplicationSource.class);
Server server = mock(Server.class);
Mockito.when(server.getServerName()).thenReturn(serverName);
Mockito.when(source.getServer()).thenReturn(server);
Mockito.when(source.getServerWALsBelongTo()).thenReturn(deadServer);
ReplicationQueueStorage storage = mock(ReplicationQueueStorage.class);
Mockito.when(storage.getWALPosition(Mockito.eq(serverName), Mockito.any(), Mockito.any()))
.thenReturn(1001L);
Mockito.when(storage.getWALPosition(Mockito.eq(deadServer), Mockito.any(), Mockito.any()))
.thenReturn(-1L);
Configuration conf = new Configuration(TEST_UTIL.getConfiguration());
conf.setInt("replication.source.maxretriesmultiplier", -1);
MetricsSource metricsSource = mock(MetricsSource.class);
doNothing().when(metricsSource).incrSizeOfLogQueue();
ReplicationSourceLogQueue logQueue = new ReplicationSourceLogQueue(conf, metricsSource, source);
logQueue.enqueueLog(new Path("/www/html/test"), walGroupId);
RecoveredReplicationSourceShipper shipper =
new RecoveredReplicationSourceShipper(conf, walGroupId, logQueue, source, storage);
assertEquals(1001L, shipper.getStartPosition());
}

private RegionServerServices setupForAbortTests(ReplicationSource rs, Configuration conf,
String endpointName) throws IOException {
conf.setInt("replication.source.maxretriesmultiplier", 1);

@@ -522,11 +504,12 @@ public class TestReplicationSource {
ReplicationSourceManager manager = Mockito.mock(ReplicationSourceManager.class);
Mockito.when(manager.getGlobalMetrics())
.thenReturn(mock(MetricsReplicationGlobalSourceSource.class));
String queueId = "qid";
RegionServerServices rss =
TEST_UTIL.createMockRegionServerService(ServerName.parseServerName("a.b.c,1,1"));
rs.init(conf, null, manager, null, mockPeer, rss, queueId, null, p -> OptionalLong.empty(),
new MetricsSource(queueId));
ReplicationQueueId queueId = new ReplicationQueueId(rss.getServerName(), "qid");
rs.init(conf, null, manager, null, mockPeer, rss,
new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(),
new MetricsSource(queueId.toString()));
return rss;
}

@@ -624,8 +607,8 @@ public class TestReplicationSource {
ManualEnvironmentEdge manualEdge = new ManualEnvironmentEdge();
EnvironmentEdgeManager.injectEdge(manualEdge);

String id = "1";
MetricsSource metrics = new MetricsSource(id);
String peerId = "1";
MetricsSource metrics = new MetricsSource(peerId);
Configuration conf = new Configuration(TEST_UTIL.getConfiguration());
conf.setInt("replication.source.maxretriesmultiplier", 1);
ReplicationPeer mockPeer = Mockito.mock(ReplicationPeer.class);

@@ -640,16 +623,17 @@ public class TestReplicationSource {
.thenReturn(mock(MetricsReplicationGlobalSourceSource.class));
RegionServerServices rss =
TEST_UTIL.createMockRegionServerService(ServerName.parseServerName("a.b.c,1,1"));

ReplicationQueueId queueId = new ReplicationQueueId(rss.getServerName(), peerId);
ReplicationSource source = new ReplicationSource();
source.init(conf, null, manager, null, mockPeer, rss, id, null, p -> OptionalLong.empty(),
source.init(conf, null, manager, null, mockPeer, rss,
new ReplicationQueueData(queueId, ImmutableMap.of()), null, p -> OptionalLong.empty(),
metrics);

final Path log1 = new Path(logDir, "log-walgroup-a.8");
manualEdge.setValue(10);
// Diff of current time (10) and log-walgroup-a.8 timestamp will be 2.
source.enqueueLog(log1);
MetricsReplicationSourceSource metricsSource1 = getSourceMetrics(id);
MetricsReplicationSourceSource metricsSource1 = getSourceMetrics(peerId);
assertEquals(2, metricsSource1.getOldestWalAge());

final Path log2 = new Path(logDir, "log-walgroup-b.4");
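
For context, a minimal sketch (not part of the patch) of how the queue identity objects used by the updated rs.init(...) / source.init(...) calls above fit together; the names mirror the diff, and the empty map stands in for the per-WAL-group offsets carried by ReplicationQueueData:

// Hedged illustration only, assuming the imports shown in the hunks above.
ServerName owner = ServerName.valueOf("test,123,123");             // server that owns the queue
ReplicationQueueId queueId = new ReplicationQueueId(owner, "qid"); // owner + peer id
ReplicationQueueData queueData =
  new ReplicationQueueData(queueId, ImmutableMap.of());            // per-WAL-group offset map, empty here
MetricsSource metrics = new MetricsSource(queueId.toString());     // metrics keyed by the full queue id string
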
File diff suppressed because it is too large

@@ -1,107 +0,0 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.replication.regionserver;

import static org.junit.Assert.assertTrue;

import java.util.List;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.Server;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.replication.ReplicationQueueInfo;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
import org.apache.hadoop.hbase.replication.ReplicationSourceDummy;
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.ReplicationTests;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;

/**
* Tests the ReplicationSourceManager with ReplicationQueueZkImpl's and
* ReplicationQueuesClientZkImpl. Also includes extra tests outside of those in
* TestReplicationSourceManager that test ReplicationQueueZkImpl-specific behaviors.
*/
@Category({ ReplicationTests.class, MediumTests.class })
public class TestReplicationSourceManagerZkImpl extends TestReplicationSourceManager {

@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestReplicationSourceManagerZkImpl.class);

@BeforeClass
public static void setUpBeforeClass() throws Exception {
conf = HBaseConfiguration.create();
conf.set("replication.replicationsource.implementation",
ReplicationSourceDummy.class.getCanonicalName());
conf.setLong("replication.sleep.before.failover", 2000);
conf.setInt("replication.source.maxretriesmultiplier", 10);
utility = new HBaseTestingUtil(conf);
utility.startMiniZKCluster();
setupZkAndReplication();
}

// Tests the naming convention of adopted queues for ReplicationQueuesZkImpl
@Test
public void testNodeFailoverDeadServerParsing() throws Exception {
Server server = new DummyServer("ec2-54-234-230-108.compute-1.amazonaws.com");
ReplicationQueueStorage queueStorage =
ReplicationStorageFactory.getReplicationQueueStorage(server.getZooKeeper(), conf);
// populate some znodes in the peer znode
files.add("log1");
files.add("log2");
for (String file : files) {
queueStorage.addWAL(server.getServerName(), "1", file);
}

// create 3 DummyServers
Server s1 = new DummyServer("ip-10-8-101-114.ec2.internal");
Server s2 = new DummyServer("ec2-107-20-52-47.compute-1.amazonaws.com");
Server s3 = new DummyServer("ec2-23-20-187-167.compute-1.amazonaws.com");

// simulate three servers fail sequentially
ServerName serverName = server.getServerName();
List<String> unclaimed = queueStorage.getAllQueues(serverName);
queueStorage.claimQueue(serverName, unclaimed.get(0), s1.getServerName());
queueStorage.removeReplicatorIfQueueIsEmpty(serverName);

serverName = s1.getServerName();
unclaimed = queueStorage.getAllQueues(serverName);
queueStorage.claimQueue(serverName, unclaimed.get(0), s2.getServerName());
queueStorage.removeReplicatorIfQueueIsEmpty(serverName);

serverName = s2.getServerName();
unclaimed = queueStorage.getAllQueues(serverName);
String queue3 =
queueStorage.claimQueue(serverName, unclaimed.get(0), s3.getServerName()).getFirst();
queueStorage.removeReplicatorIfQueueIsEmpty(serverName);

ReplicationQueueInfo replicationQueueInfo = new ReplicationQueueInfo(queue3);
List<ServerName> result = replicationQueueInfo.getDeadRegionServers();
// verify
assertTrue(result.contains(server.getServerName()));
assertTrue(result.contains(s1.getServerName()));
assertTrue(result.contains(s2.getServerName()));

server.stop("");
}
}

@@ -0,0 +1,147 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.replication.regionserver;

import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertNull;
import static org.junit.Assert.assertTrue;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionInfoBuilder;
import org.apache.hadoop.hbase.testclassification.ReplicationTests;
import org.apache.hadoop.hbase.testclassification.SmallTests;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.wal.WALEdit;
import org.apache.hadoop.hbase.wal.WALKeyImpl;
import org.junit.ClassRule;
import org.junit.Test;
import org.junit.experimental.categories.Category;

import org.apache.hbase.thirdparty.com.google.protobuf.UnsafeByteOperations;

import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.WALProtos.BulkLoadDescriptor;

@Category({ ReplicationTests.class, SmallTests.class })
public class TestReplicationWALEdits {

@ClassRule
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestReplicationWALEdits.class);

private static final Configuration CONF = HBaseConfiguration.create();

private static final TableName TABLE_NAME = TableName.valueOf("test");

private static final byte[] F1 = Bytes.toBytes("f1");

private static final byte[] F2 = Bytes.toBytes("f2");

private static final RegionInfo RI = RegionInfoBuilder.newBuilder(TABLE_NAME).build();

/**
* Test for HBASE-9038, Replication.scopeWALEdits would NPE if it wasn't filtering out the
* compaction WALEdit.
*/
@Test
public void testCompactionWALEdits() throws Exception {
TableName tableName = TableName.valueOf("testCompactionWALEdits");
WALProtos.CompactionDescriptor compactionDescriptor =
WALProtos.CompactionDescriptor.getDefaultInstance();
RegionInfo hri = RegionInfoBuilder.newBuilder(tableName).setStartKey(HConstants.EMPTY_START_ROW)
.setEndKey(HConstants.EMPTY_END_ROW).build();
WALEdit edit = WALEdit.createCompaction(hri, compactionDescriptor);
ReplicationSourceWALActionListener.scopeWALEdits(new WALKeyImpl(), edit, CONF);
}

private WALEdit getBulkLoadWALEdit(NavigableMap<byte[], Integer> scope) {
// 1. Create store files for the families
Map<byte[], List<Path>> storeFiles = new HashMap<>(1);
Map<String, Long> storeFilesSize = new HashMap<>(1);
List<Path> p = new ArrayList<>(1);
Path hfilePath1 = new Path(Bytes.toString(F1));
p.add(hfilePath1);
storeFilesSize.put(hfilePath1.getName(), 0L);
storeFiles.put(F1, p);
scope.put(F1, 1);
p = new ArrayList<>(1);
Path hfilePath2 = new Path(Bytes.toString(F2));
p.add(hfilePath2);
storeFilesSize.put(hfilePath2.getName(), 0L);
storeFiles.put(F2, p);
// 2. Create bulk load descriptor
BulkLoadDescriptor desc = ProtobufUtil.toBulkLoadDescriptor(RI.getTable(),
UnsafeByteOperations.unsafeWrap(RI.getEncodedNameAsBytes()), storeFiles, storeFilesSize, 1);

// 3. create bulk load wal edit event
WALEdit logEdit = WALEdit.createBulkLoadEvent(RI, desc);
return logEdit;
}

@Test
public void testBulkLoadWALEditsWithoutBulkLoadReplicationEnabled() throws Exception {
NavigableMap<byte[], Integer> scope = new TreeMap<>(Bytes.BYTES_COMPARATOR);
// 1. Get the bulk load wal edit event
WALEdit logEdit = getBulkLoadWALEdit(scope);
// 2. Create wal key
WALKeyImpl logKey = new WALKeyImpl(scope);

// 3. Get the scopes for the key
ReplicationSourceWALActionListener.scopeWALEdits(logKey, logEdit, CONF);

// 4. Assert that no bulk load entry scopes are added if bulk load hfile replication is disabled
assertNull("No bulk load entries scope should be added if bulk load replication is disabled.",
logKey.getReplicationScopes());
}

@Test
public void testBulkLoadWALEdits() throws Exception {
// 1. Get the bulk load wal edit event
NavigableMap<byte[], Integer> scope = new TreeMap<>(Bytes.BYTES_COMPARATOR);
WALEdit logEdit = getBulkLoadWALEdit(scope);
// 2. Create wal key
WALKeyImpl logKey = new WALKeyImpl(scope);
// 3. Enable bulk load hfile replication
Configuration bulkLoadConf = HBaseConfiguration.create(CONF);
bulkLoadConf.setBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, true);

// 4. Get the scopes for the key
ReplicationSourceWALActionListener.scopeWALEdits(logKey, logEdit, bulkLoadConf);

NavigableMap<byte[], Integer> scopes = logKey.getReplicationScopes();
// Assert family with replication scope global is present in the key scopes
assertTrue("This family scope is set to global, should be part of replication key scopes.",
scopes.containsKey(F1));
// Assert family with replication scope local is not present in the key scopes
assertFalse("This family scope is set to local, should not be part of replication key scopes",
scopes.containsKey(F2));
}
}
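
As a side note on what the bulk load tests above exercise: scopeWALEdits() drops bulk load entries unless bulk load HFile replication is switched on, and even then only families with a global replication scope end up in the key's scopes. A minimal sketch, assuming the standard client builders (none of this is in the patch):

// Hedged illustration only: one replicated family, one local-only family, and the switch.
Configuration conf = HBaseConfiguration.create();
conf.setBoolean(HConstants.REPLICATION_BULKLOAD_ENABLE_KEY, true);
TableDescriptor td = TableDescriptorBuilder.newBuilder(TableName.valueOf("test"))
  .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("f1"))
    .setScope(HConstants.REPLICATION_SCOPE_GLOBAL).build())  // replicated
  .setColumnFamily(ColumnFamilyDescriptorBuilder.newBuilder(Bytes.toBytes("f2"))
    .setScope(HConstants.REPLICATION_SCOPE_LOCAL).build())   // not replicated
  .build();
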

@@ -46,6 +46,8 @@ import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.replication.ReplicationBarrierFamilyFormat;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
import org.apache.hadoop.hbase.testclassification.MediumTests;

@@ -94,10 +96,11 @@ public class TestSerialReplicationChecker {
@BeforeClass
public static void setUpBeforeClass() throws Exception {
UTIL.startMiniCluster(1);
QUEUE_STORAGE = ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getZooKeeperWatcher(),
UTIL.getConfiguration());
QUEUE_STORAGE.addWAL(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), PEER_ID,
WAL_FILE_NAME);
TableName repTable = TableName.valueOf("test_serial_rep");
UTIL.getAdmin()
.createTable(ReplicationStorageFactory.createReplicationQueueTableDescriptor(repTable));
QUEUE_STORAGE =
ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getConnection(), repTable);
}

@AfterClass

@@ -174,8 +177,10 @@ public class TestSerialReplicationChecker {
}

private void updatePushedSeqId(RegionInfo region, long seqId) throws ReplicationException {
QUEUE_STORAGE.setWALPosition(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(),
PEER_ID, WAL_FILE_NAME, 10, ImmutableMap.of(region.getEncodedName(), seqId));
ReplicationQueueId queueId = new ReplicationQueueId(
UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), PEER_ID);
QUEUE_STORAGE.setOffset(queueId, "", new ReplicationGroupOffset(WAL_FILE_NAME, 10),
ImmutableMap.of(region.getEncodedName(), seqId));
}

private void addParents(RegionInfo region, List<RegionInfo> parents) throws IOException {
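
The updatePushedSeqId() hunk above shows the shape of the new offset API: instead of setWALPosition(server, peer, wal, position, seqIds), a position is now recorded against a ReplicationQueueId with a ReplicationGroupOffset. A minimal sketch, assuming a queueStorage, serverName and encodedRegionName already in scope (the peer id, file name and numbers are made up for illustration):

// Hedged illustration only: record that "wal.1" has been replicated up to offset 10
// for peer "1", together with the last pushed sequence id per region.
ReplicationQueueId queueId = new ReplicationQueueId(serverName, "1");
queueStorage.setOffset(queueId, "", new ReplicationGroupOffset("wal.1", 10),
  ImmutableMap.of(encodedRegionName, 100L));
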

@@ -25,7 +25,6 @@ import java.net.SocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;

@@ -35,7 +34,6 @@ import org.apache.hadoop.hbase.CellBuilderType;
import org.apache.hadoop.hbase.CellScanner;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.Stoppable;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.AdvancedScanResultConsumer;
import org.apache.hadoop.hbase.client.AsyncClusterConnection;

@@ -55,8 +53,6 @@ import org.junit.Rule;
import org.junit.Test;
import org.junit.experimental.categories.Category;
import org.junit.rules.TestName;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.apache.hbase.thirdparty.com.google.protobuf.ByteString;

@@ -72,31 +68,12 @@ public class TestWALEntrySinkFilter {
public static final HBaseClassTestRule CLASS_RULE =
HBaseClassTestRule.forClass(TestWALEntrySinkFilter.class);

private static final Logger LOG = LoggerFactory.getLogger(TestReplicationSink.class);
@Rule
public TestName name = new TestName();
static final int BOUNDARY = 5;
static final AtomicInteger UNFILTERED = new AtomicInteger();
static final AtomicInteger FILTERED = new AtomicInteger();

/**
* Implemetentation of Stoppable to pass into ReplicationSink.
*/
private static Stoppable STOPPABLE = new Stoppable() {
private final AtomicBoolean stop = new AtomicBoolean(false);

@Override
public boolean isStopped() {
return this.stop.get();
}

@Override
public void stop(String why) {
LOG.info("STOPPING BECAUSE: " + why);
this.stop.set(true);
}
};

/**
* Test filter. Filter will filter out any write time that is <= 5 (BOUNDARY). We count how many
* items we filter out and we count how many cells make it through for distribution way down below

@@ -29,6 +29,7 @@ import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.MetaTableAccessor;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptorBuilder;
import org.apache.hadoop.hbase.client.Put;

@@ -43,7 +44,9 @@ import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.replication.ReplicationBarrierFamilyFormat;
import org.apache.hadoop.hbase.replication.ReplicationException;
import org.apache.hadoop.hbase.replication.ReplicationGroupOffset;
import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
import org.apache.hadoop.hbase.replication.ReplicationQueueId;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
import org.apache.hadoop.hbase.testclassification.MediumTests;

@@ -78,13 +81,9 @@ public class TestHBaseFsckCleanReplicationBarriers {
@BeforeClass
public static void setUp() throws Exception {
UTIL.startMiniCluster(1);
QUEUE_STORAGE = ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getZooKeeperWatcher(),
QUEUE_STORAGE = ReplicationStorageFactory.getReplicationQueueStorage(UTIL.getConnection(),
UTIL.getConfiguration());
createPeer();
QUEUE_STORAGE.addWAL(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), PEER_1,
WAL_FILE_NAME);
QUEUE_STORAGE.addWAL(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(), PEER_2,
WAL_FILE_NAME);
}

@AfterClass

@@ -205,9 +204,12 @@ public class TestHBaseFsckCleanReplicationBarriers {
}

private void updatePushedSeqId(RegionInfo region, long seqId) throws ReplicationException {
QUEUE_STORAGE.setWALPosition(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(),
PEER_1, WAL_FILE_NAME, 10, ImmutableMap.of(region.getEncodedName(), seqId));
QUEUE_STORAGE.setWALPosition(UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName(),
PEER_2, WAL_FILE_NAME, 10, ImmutableMap.of(region.getEncodedName(), seqId));
ServerName sn = UTIL.getMiniHBaseCluster().getRegionServer(0).getServerName();
QUEUE_STORAGE.setOffset(new ReplicationQueueId(sn, PEER_1), "",
new ReplicationGroupOffset(WAL_FILE_NAME, 10),
ImmutableMap.of(region.getEncodedName(), seqId));
QUEUE_STORAGE.setOffset(new ReplicationQueueId(sn, PEER_2), "",
new ReplicationGroupOffset(WAL_FILE_NAME, 10),
ImmutableMap.of(region.getEncodedName(), seqId));
}
}
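
The setUp() hunk above also shows the bootstrap change that runs through these tests: the queue storage is now obtained from a client Connection (table based) rather than from a ZKWatcher. A minimal sketch of the two factory calls visible in this commit, assuming a Connection named conn, a Configuration named conf and an Admin named admin:

// Hedged illustration only.
// Against the default queue table:
ReplicationQueueStorage storage =
  ReplicationStorageFactory.getReplicationQueueStorage(conn, conf);
// Or against an explicitly created queue table:
TableName queueTable = TableName.valueOf("test_serial_rep");
admin.createTable(ReplicationStorageFactory.createReplicationQueueTableDescriptor(queueTable));
ReplicationQueueStorage storage2 =
  ReplicationStorageFactory.getReplicationQueueStorage(conn, queueTable);
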

@@ -55,9 +55,12 @@ import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
import org.junit.After;
import org.junit.Before;
import org.junit.ClassRule;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;

//revisit later
@Ignore
@Category({ MiscTests.class, MediumTests.class })
public class TestHBaseFsckEncryption {

@@ -40,11 +40,14 @@ import org.junit.AfterClass;
import org.junit.Before;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;

import org.apache.hbase.thirdparty.com.google.common.util.concurrent.ThreadFactoryBuilder;

// revisit later
@Ignore
@Category({ MiscTests.class, MediumTests.class })
public class TestHBaseFsckMOB extends BaseTestHBaseFsck {

@@ -17,25 +17,14 @@
*/
package org.apache.hadoop.hbase.util;

import static org.junit.Assert.assertEquals;

import java.util.List;
import java.util.stream.Stream;
import org.apache.hadoop.hbase.HBaseClassTestRule;
import org.apache.hadoop.hbase.HBaseTestingUtil;
import org.apache.hadoop.hbase.ServerName;
import org.apache.hadoop.hbase.replication.ReplicationPeerConfig;
import org.apache.hadoop.hbase.replication.ReplicationPeerStorage;
import org.apache.hadoop.hbase.replication.ReplicationQueueStorage;
import org.apache.hadoop.hbase.replication.ReplicationStorageFactory;
import org.apache.hadoop.hbase.replication.SyncReplicationState;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.testclassification.ReplicationTests;
import org.apache.hadoop.hbase.util.HbckErrorReporter.ERROR_CODE;
import org.apache.hadoop.hbase.util.hbck.HbckTestingUtil;
import org.junit.AfterClass;
import org.junit.BeforeClass;
import org.junit.ClassRule;
import org.junit.Ignore;
import org.junit.Test;
import org.junit.experimental.categories.Category;

@@ -59,51 +48,53 @@ public class TestHBaseFsckReplication {
UTIL.shutdownMiniCluster();
}

// TODO: reimplement
@Ignore
@Test
public void test() throws Exception {
ReplicationPeerStorage peerStorage = ReplicationStorageFactory.getReplicationPeerStorage(
UTIL.getTestFileSystem(), UTIL.getZooKeeperWatcher(), UTIL.getConfiguration());
ReplicationQueueStorage queueStorage = ReplicationStorageFactory
.getReplicationQueueStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration());

String peerId1 = "1";
String peerId2 = "2";
peerStorage.addPeer(peerId1, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(),
true, SyncReplicationState.NONE);
peerStorage.addPeer(peerId2, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(),
true, SyncReplicationState.NONE);
for (int i = 0; i < 10; i++) {
queueStorage.addWAL(ServerName.valueOf("localhost", 10000 + i, 100000 + i), peerId1,
"file-" + i);
}
queueStorage.addWAL(ServerName.valueOf("localhost", 10000, 100000), peerId2, "file");
HBaseFsck fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true);
HbckTestingUtil.assertNoErrors(fsck);

// should not remove anything since the replication peer is still alive
assertEquals(10, queueStorage.getListOfReplicators().size());
peerStorage.removePeer(peerId1);
// there should be orphan queues
assertEquals(10, queueStorage.getListOfReplicators().size());
fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), false);
HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> {
return ERROR_CODE.UNDELETED_REPLICATION_QUEUE;
}).limit(10).toArray(ERROR_CODE[]::new));

// should not delete anything when fix is false
assertEquals(10, queueStorage.getListOfReplicators().size());

fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true);
HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> {
return ERROR_CODE.UNDELETED_REPLICATION_QUEUE;
}).limit(10).toArray(ERROR_CODE[]::new));

List<ServerName> replicators = queueStorage.getListOfReplicators();
// should not remove the server with queue for peerId2
assertEquals(1, replicators.size());
assertEquals(ServerName.valueOf("localhost", 10000, 100000), replicators.get(0));
for (String queueId : queueStorage.getAllQueues(replicators.get(0))) {
assertEquals(peerId2, queueId);
}
// ReplicationPeerStorage peerStorage = ReplicationStorageFactory
// .getReplicationPeerStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration());
// ReplicationQueueStorage queueStorage = ReplicationStorageFactory
// .getReplicationQueueStorage(UTIL.getZooKeeperWatcher(), UTIL.getConfiguration());
//
// String peerId1 = "1";
// String peerId2 = "2";
// peerStorage.addPeer(peerId1, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(),
// true, SyncReplicationState.NONE);
// peerStorage.addPeer(peerId2, ReplicationPeerConfig.newBuilder().setClusterKey("key").build(),
// true, SyncReplicationState.NONE);
// for (int i = 0; i < 10; i++) {
// queueStorage.addWAL(ServerName.valueOf("localhost", 10000 + i, 100000 + i), peerId1,
// "file-" + i);
// }
// queueStorage.addWAL(ServerName.valueOf("localhost", 10000, 100000), peerId2, "file");
// HBaseFsck fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true);
// HbckTestingUtil.assertNoErrors(fsck);
//
// // should not remove anything since the replication peer is still alive
// assertEquals(10, queueStorage.getListOfReplicators().size());
// peerStorage.removePeer(peerId1);
// // there should be orphan queues
// assertEquals(10, queueStorage.getListOfReplicators().size());
// fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), false);
// HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> {
// return ERROR_CODE.UNDELETED_REPLICATION_QUEUE;
// }).limit(10).toArray(ERROR_CODE[]::new));
//
// // should not delete anything when fix is false
// assertEquals(10, queueStorage.getListOfReplicators().size());
//
// fsck = HbckTestingUtil.doFsck(UTIL.getConfiguration(), true);
// HbckTestingUtil.assertErrors(fsck, Stream.generate(() -> {
// return ERROR_CODE.UNDELETED_REPLICATION_QUEUE;
// }).limit(10).toArray(ERROR_CODE[]::new));
//
// List<ServerName> replicators = queueStorage.getListOfReplicators();
// // should not remove the server with queue for peerId2
// assertEquals(1, replicators.size());
// assertEquals(ServerName.valueOf("localhost", 10000, 100000), replicators.get(0));
// for (String queueId : queueStorage.getAllQueues(replicators.get(0))) {
// assertEquals(peerId2, queueId);
// }
}
}