HBASE-19144 [RSgroups] Retry assignments in FAILED_OPEN state when servers (re)join the cluster
This commit is contained in:
parent
b9b0f15cdb
commit
5df9651581
|
@ -36,6 +36,10 @@ import org.apache.hadoop.hbase.net.Address;
|
||||||
*/
|
*/
|
||||||
@InterfaceAudience.Private
|
@InterfaceAudience.Private
|
||||||
public interface RSGroupInfoManager {
|
public interface RSGroupInfoManager {
|
||||||
|
|
||||||
|
String REASSIGN_WAIT_INTERVAL_KEY = "hbase.rsgroup.reassign.wait";
|
||||||
|
long DEFAULT_REASSIGN_WAIT_INTERVAL = 30 * 1000L;
|
||||||
|
|
||||||
//Assigned before user tables
|
//Assigned before user tables
|
||||||
TableName RSGROUP_TABLE_NAME =
|
TableName RSGROUP_TABLE_NAME =
|
||||||
TableName.valueOf(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "rsgroup");
|
TableName.valueOf(NamespaceDescriptor.SYSTEM_NAMESPACE_NAME_STR, "rsgroup");
|
||||||
|
|
|
@ -36,6 +36,7 @@ import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
|
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
|
import org.apache.hadoop.conf.Configuration;
|
||||||
import org.apache.hadoop.hbase.Cell;
|
import org.apache.hadoop.hbase.Cell;
|
||||||
import org.apache.hadoop.hbase.CellUtil;
|
import org.apache.hadoop.hbase.CellUtil;
|
||||||
import org.apache.hadoop.hbase.Coprocessor;
|
import org.apache.hadoop.hbase.Coprocessor;
|
||||||
|
@ -65,6 +66,7 @@ import org.apache.hadoop.hbase.ipc.CoprocessorRpcChannel;
|
||||||
import org.apache.hadoop.hbase.master.MasterServices;
|
import org.apache.hadoop.hbase.master.MasterServices;
|
||||||
import org.apache.hadoop.hbase.master.ServerListener;
|
import org.apache.hadoop.hbase.master.ServerListener;
|
||||||
import org.apache.hadoop.hbase.master.TableStateManager;
|
import org.apache.hadoop.hbase.master.TableStateManager;
|
||||||
|
import org.apache.hadoop.hbase.master.assignment.RegionStates.RegionStateNode;
|
||||||
import org.apache.hadoop.hbase.net.Address;
|
import org.apache.hadoop.hbase.net.Address;
|
||||||
import org.apache.hadoop.hbase.procedure2.Procedure;
|
import org.apache.hadoop.hbase.procedure2.Procedure;
|
||||||
import org.apache.hadoop.hbase.protobuf.ProtobufMagic;
|
import org.apache.hadoop.hbase.protobuf.ProtobufMagic;
|
||||||
|
@ -144,6 +146,7 @@ class RSGroupInfoManagerImpl implements RSGroupInfoManager {
|
||||||
private Set<String> prevRSGroups = new HashSet<>();
|
private Set<String> prevRSGroups = new HashSet<>();
|
||||||
private final ServerEventsListenerThread serverEventsListenerThread =
|
private final ServerEventsListenerThread serverEventsListenerThread =
|
||||||
new ServerEventsListenerThread();
|
new ServerEventsListenerThread();
|
||||||
|
private FailedOpenUpdaterThread failedOpenUpdaterThread;
|
||||||
|
|
||||||
private RSGroupInfoManagerImpl(MasterServices masterServices) throws IOException {
|
private RSGroupInfoManagerImpl(MasterServices masterServices) throws IOException {
|
||||||
this.masterServices = masterServices;
|
this.masterServices = masterServices;
|
||||||
|
@ -156,6 +159,9 @@ class RSGroupInfoManagerImpl implements RSGroupInfoManager {
|
||||||
rsGroupStartupWorker.start();
|
rsGroupStartupWorker.start();
|
||||||
serverEventsListenerThread.start();
|
serverEventsListenerThread.start();
|
||||||
masterServices.getServerManager().registerListener(serverEventsListenerThread);
|
masterServices.getServerManager().registerListener(serverEventsListenerThread);
|
||||||
|
failedOpenUpdaterThread = new FailedOpenUpdaterThread(masterServices.getConfiguration());
|
||||||
|
failedOpenUpdaterThread.start();
|
||||||
|
masterServices.getServerManager().registerListener(failedOpenUpdaterThread);
|
||||||
}
|
}
|
||||||
|
|
||||||
static RSGroupInfoManager getInstance(MasterServices master) throws IOException {
|
static RSGroupInfoManager getInstance(MasterServices master) throws IOException {
|
||||||
|
@ -564,6 +570,26 @@ class RSGroupInfoManagerImpl implements RSGroupInfoManager {
|
||||||
flushConfig(newGroupMap);
|
flushConfig(newGroupMap);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Called by FailedOpenUpdaterThread
|
||||||
|
private void updateFailedAssignments() {
|
||||||
|
// Kick all regions in FAILED_OPEN state
|
||||||
|
List<RegionInfo> stuckAssignments = Lists.newArrayList();
|
||||||
|
for (RegionStateNode state:
|
||||||
|
masterServices.getAssignmentManager().getRegionStates().getRegionsInTransition()) {
|
||||||
|
if (state.isStuck()) {
|
||||||
|
stuckAssignments.add(state.getRegionInfo());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for (RegionInfo region: stuckAssignments) {
|
||||||
|
LOG.info("Retrying assignment of " + region);
|
||||||
|
try {
|
||||||
|
masterServices.getAssignmentManager().unassign(region);
|
||||||
|
} catch (IOException e) {
|
||||||
|
LOG.warn("Unable to reassign " + region, e);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Calls {@link RSGroupInfoManagerImpl#updateDefaultServers(SortedSet)} to update list of known
|
* Calls {@link RSGroupInfoManagerImpl#updateDefaultServers(SortedSet)} to update list of known
|
||||||
* servers. Notifications about server changes are received by registering {@link ServerListener}.
|
* servers. Notifications about server changes are received by registering {@link ServerListener}.
|
||||||
|
@ -608,7 +634,7 @@ class RSGroupInfoManagerImpl implements RSGroupInfoManager {
|
||||||
}
|
}
|
||||||
try {
|
try {
|
||||||
synchronized (this) {
|
synchronized (this) {
|
||||||
if(!changed) {
|
while (!changed) {
|
||||||
wait();
|
wait();
|
||||||
}
|
}
|
||||||
changed = false;
|
changed = false;
|
||||||
|
@ -623,6 +649,67 @@ class RSGroupInfoManagerImpl implements RSGroupInfoManager {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private class FailedOpenUpdaterThread extends Thread implements ServerListener {
|
||||||
|
private final long waitInterval;
|
||||||
|
private volatile boolean hasChanged = false;
|
||||||
|
|
||||||
|
public FailedOpenUpdaterThread(Configuration conf) {
|
||||||
|
this.waitInterval = conf.getLong(REASSIGN_WAIT_INTERVAL_KEY,
|
||||||
|
DEFAULT_REASSIGN_WAIT_INTERVAL);
|
||||||
|
setDaemon(true);
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void serverAdded(ServerName serverName) {
|
||||||
|
serverChanged();
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void serverRemoved(ServerName serverName) {
|
||||||
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
|
public void run() {
|
||||||
|
while (isMasterRunning(masterServices)) {
|
||||||
|
boolean interrupted = false;
|
||||||
|
try {
|
||||||
|
synchronized (this) {
|
||||||
|
while (!hasChanged) {
|
||||||
|
wait();
|
||||||
|
}
|
||||||
|
hasChanged = false;
|
||||||
|
}
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
LOG.warn("Interrupted", e);
|
||||||
|
interrupted = true;
|
||||||
|
}
|
||||||
|
if (!isMasterRunning(masterServices) || interrupted) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// First, wait a while in case more servers are about to rejoin the cluster
|
||||||
|
try {
|
||||||
|
Thread.sleep(waitInterval);
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
LOG.warn("Interrupted", e);
|
||||||
|
}
|
||||||
|
if (!isMasterRunning(masterServices)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Kick all regions in FAILED_OPEN state
|
||||||
|
updateFailedAssignments();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
public void serverChanged() {
|
||||||
|
synchronized (this) {
|
||||||
|
hasChanged = true;
|
||||||
|
this.notify();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private class RSGroupStartupWorker extends Thread {
|
private class RSGroupStartupWorker extends Thread {
|
||||||
private final Log LOG = LogFactory.getLog(RSGroupStartupWorker.class);
|
private final Log LOG = LogFactory.getLog(RSGroupStartupWorker.class);
|
||||||
private volatile boolean online = false;
|
private volatile boolean online = false;
|
||||||
|
|
Loading…
Reference in New Issue