HBASE-23261 Region stuck in transition while splitting
Processing ZK BadVersionException during node transition.

Signed-off-by: Andrew Purtell <apurtell@apache.org>
This commit is contained in:
parent
b566a4f9bc
commit
8e60b0c0c6
|
@ -868,7 +868,15 @@ public class ZKAssign {
|
|||
try {
|
||||
rt = RegionTransition.createRegionTransition(
|
||||
endState, region.getRegionName(), serverName, payload);
|
||||
if(!ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion())) {
|
||||
boolean isDataSet;
|
||||
try {
|
||||
isDataSet = ZKUtil.setData(zkw, node, rt.toByteArray(), stat.getVersion());
|
||||
} catch (KeeperException.BadVersionException e) {
|
||||
isDataSet = false;
|
||||
LOG.error("Received BadVersionException from ZK for " + encoded
|
||||
+ ", version: " + stat.getVersion());
|
||||
}
|
||||
if (!isDataSet) {
|
||||
LOG.warn(zkw.prefix("Attempt to transition the " +
|
||||
"unassigned node for " + encoded +
|
||||
" from " + beginState + " to " + endState + " failed, " +
|
||||
|
|
|
@ -40,6 +40,13 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat
|
|||
private CoordinatedStateManager coordinationManager;
|
||||
private final ZooKeeperWatcher watcher;
|
||||
|
||||
// Max wait for split transaction: 100 loop iterations with a 100 ms thread sleep each time;
|
||||
// this accounts for ~24 s due to the calls involved in the loop. Even for a busy cluster, by this time,
|
||||
// we should have been able to complete setData(). In fact, ideally, the 2nd retry after a failed
|
||||
// attempt should be sufficient to retrieve the correct ZK node version and successfully update
|
||||
// RIT info in ZK node.
|
||||
private static final int SPIN_WAIT_TIMEOUT = 100;
|
||||
|
||||
private static final Log LOG = LogFactory.getLog(ZKSplitTransactionCoordination.class);
|
||||
|
||||
public ZKSplitTransactionCoordination(CoordinatedStateManager coordinationProvider,
|
||||
|
@ -163,6 +170,10 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat
|
|||
}
|
||||
Thread.sleep(100);
|
||||
spins++;
|
||||
if (spins > SPIN_WAIT_TIMEOUT) {
|
||||
throw new IOException("Waiting time for Split Transaction exceeded for region: "
|
||||
+ parent.getRegionInfo().getRegionNameAsString());
|
||||
}
|
||||
byte[] data = ZKAssign.getDataNoWatch(watcher, node, stat);
|
||||
if (data == null) {
|
||||
throw new IOException("Data is null, splitting node " + node + " no longer exists");
|
||||
|
@ -222,9 +233,14 @@ public class ZKSplitTransactionCoordination implements SplitTransactionCoordinat
|
|||
// Tell master about split by updating zk. If we fail, abort.
|
||||
if (coordinationManager.getServer() != null) {
|
||||
try {
|
||||
zstd.setZnodeVersion(transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
|
||||
int newNodeVersion = transitionSplittingNode(parent.getRegionInfo(), a.getRegionInfo(),
|
||||
b.getRegionInfo(), coordinationManager.getServer().getServerName(), zstd,
|
||||
RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT));
|
||||
RS_ZK_REGION_SPLITTING, RS_ZK_REGION_SPLIT);
|
||||
if (newNodeVersion == -1) {
|
||||
throw new IOException("Notifying master of RS split failed for region: "
|
||||
+ parent.getRegionInfo().getRegionNameAsString());
|
||||
}
|
||||
zstd.setZnodeVersion(newNodeVersion);
|
||||
|
||||
int spins = 0;
|
||||
// Now wait for the master to process the split. We know it's done
|
||||
|
|
Loading…
Reference in New Issue