HBASE-3181 Review, document, and fix up Regions-in-Transition timeout logic

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1029938 13f79535-47bb-0310-9956-ffa450edef68

commit 8387a5b18f
parent 18a78fef4d

@@ -1085,6 +1085,9 @@ Release 0.21.0 - Unreleased
    HBASE-3184 Xmx setting in pom to use for tests/surefire does not appear
               to work
    HBASE-3120 [rest] Content transcoding
+   HBASE-3181 Review, document, and fix up Regions-in-Transition timeout
+              logic
+
 
 NEW FEATURES
    HBASE-1961 HBase EC2 scripts
@@ -498,9 +498,9 @@ public class MetaReader {
       Result result;
       while((result = metaServer.next(scannerid)) != null) {
         if (result != null && result.size() > 0) {
-          Pair<HRegionInfo, HServerAddress> pair = metaRowToRegionPair(result);
-          if (pair.getSecond() == null ||
-              !pair.getSecond().equals(hsi.getServerAddress())) {
+          Pair<HRegionInfo, HServerInfo> pair =
+            metaRowToRegionPairWithInfo(result);
+          if (pair.getSecond() == null || !pair.getSecond().equals(hsi)) {
             continue;
           }
           hris.put(pair.getFirst(), result);
@@ -34,6 +34,7 @@ import java.util.NavigableMap;
 import java.util.Set;
 import java.util.SortedMap;
 import java.util.TreeMap;
+import java.util.TreeSet;
 import java.util.concurrent.ConcurrentNavigableMap;
 import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.Executors;
@@ -97,13 +98,19 @@ public class AssignmentManager extends ZooKeeperListener {
 
   private TimeoutMonitor timeoutMonitor;
 
-  /** Regions currently in transition. */
+  /**
+   * Regions currently in transition. Map of encoded region names to the master
+   * in-memory state for that region.
+   */
   final ConcurrentSkipListMap<String, RegionState> regionsInTransition =
     new ConcurrentSkipListMap<String, RegionState>();
 
   /** Plans for region movement. Key is the encoded version of a region name*/
   // TODO: When do plans get cleaned out? Ever? In server open and in server
   // shutdown processing -- St.Ack
+  // TODO: Better to just synchronize access around regionPlans? I think that
+  // would be better than a concurrent structure since we do more than
+  // one operation at a time -- jgray
   final ConcurrentNavigableMap<String, RegionPlan> regionPlans =
     new ConcurrentSkipListMap<String, RegionPlan>();
 
@@ -152,9 +159,9 @@ public class AssignmentManager extends ZooKeeperListener {
     this.executorService = service;
     Configuration conf = master.getConfiguration();
     this.timeoutMonitor = new TimeoutMonitor(
-      conf.getInt("hbase.master.assignment.timeoutmonitor.period", 30000),
+      conf.getInt("hbase.master.assignment.timeoutmonitor.period", 10000),
       master,
-      conf.getInt("hbase.master.assignment.timeoutmonitor.timeout", 15000));
+      conf.getInt("hbase.master.assignment.timeoutmonitor.timeout", 30000));
     Threads.setDaemonThreadRunning(timeoutMonitor,
       master.getServerName() + ".timeoutMonitor");
   }
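Note that the two defaults trade places here: the monitor now wakes up more often (every 10s instead of 30s) but waits longer (30s instead of 15s) before declaring a region in transition stuck. A minimal sketch of overriding the two properties before cluster startup, modeled on what TestRollingRestart does later in this same commit; the 2000/5000 values are illustrative, not recommendations:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hbase.HBaseConfiguration;

    public class TimeoutMonitorConfigSketch {
      public static Configuration aggressiveTimeouts() {
        Configuration conf = HBaseConfiguration.create();
        // How often the TimeoutMonitor chore runs, in ms (commit default: 10000).
        conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
        // How long a region may sit in transition before the monitor
        // intervenes, in ms (commit default: 30000).
        conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 5000);
        return conf;
      }
    }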
@@ -272,14 +279,14 @@ public class AssignmentManager extends ZooKeeperListener {
         // Region is closed, insert into RIT and handle it
         regionsInTransition.put(encodedRegionName, new RegionState(
             regionInfo, RegionState.State.CLOSED, data.getStamp()));
-        new ClosedRegionHandler(master, this, data, regionInfo).process();
+        new ClosedRegionHandler(master, this, regionInfo).process();
         break;
 
       case M_ZK_REGION_OFFLINE:
         // Region is offline, insert into RIT and handle it like a closed
         regionsInTransition.put(encodedRegionName, new RegionState(
             regionInfo, RegionState.State.OFFLINE, data.getStamp()));
-        new ClosedRegionHandler(master, this, data, regionInfo).process();
+        new ClosedRegionHandler(master, this, regionInfo).process();
         break;
 
       case RS_ZK_REGION_OPENING:
@@ -303,7 +310,7 @@ public class AssignmentManager extends ZooKeeperListener {
             "; letting RIT timeout so will be assigned elsewhere");
           break;
         }
-        new OpenedRegionHandler(master, this, data, regionInfo, hsi).process();
+        new OpenedRegionHandler(master, this, regionInfo, hsi).process();
         break;
     }
   }
@@ -367,7 +374,7 @@ public class AssignmentManager extends ZooKeeperListener {
         // what follows will fail because not in expected state.
         regionState.update(RegionState.State.CLOSED, data.getStamp());
         this.executorService.submit(new ClosedRegionHandler(master,
-          this, data, regionState.getRegion()));
+          this, regionState.getRegion()));
         break;
 
       case RS_ZK_REGION_OPENING:
@@ -400,7 +407,7 @@ public class AssignmentManager extends ZooKeeperListener {
         // Handle OPENED by removing from transition and deleted zk node
         regionState.update(RegionState.State.OPEN, data.getStamp());
         this.executorService.submit(
-          new OpenedRegionHandler(master, this, data, regionState.getRegion(),
+          new OpenedRegionHandler(master, this, regionState.getRegion(),
             this.serverManager.getServerInfo(data.getServerName())));
         break;
     }
@@ -600,7 +607,8 @@ public class AssignmentManager extends ZooKeeperListener {
   public void offlineDisabledRegion(HRegionInfo regionInfo) {
     // Disabling so should not be reassigned, just delete the CLOSED node
     LOG.debug("Table being disabled so deleting ZK node and removing from " +
-      "regions in transition, skipping assignment");
+      "regions in transition, skipping assignment of region " +
+      regionInfo.getRegionNameAsString());
     try {
       if (!ZKAssign.deleteClosedNode(watcher, regionInfo.getEncodedName())) {
         // Could also be in OFFLINE mode
@@ -632,8 +640,15 @@ public class AssignmentManager extends ZooKeeperListener {
    * in-memory checks pass, the zk node is forced to OFFLINE before assigning.
    *
    * @param regionName server to be assigned
+   * @param setOfflineInZK whether ZK node should be created/transitioned to an
+   *        OFFLINE state before assigning the region
    */
-  public void assign(HRegionInfo region) {
+  public void assign(HRegionInfo region, boolean setOfflineInZK) {
+    assign(region, setOfflineInZK, false);
+  }
+
+  public void assign(HRegionInfo region, boolean setOfflineInZK,
+      boolean forceNewPlan) {
     String tableName = region.getTableDesc().getNameAsString();
     if (isTableDisabled(tableName)) {
       LOG.info("Table " + tableName + " disabled; skipping assign of " +
@@ -648,7 +663,7 @@ public class AssignmentManager extends ZooKeeperListener {
     }
     RegionState state = addToRegionsInTransition(region);
     synchronized (state) {
-      assign(state);
+      assign(state, setOfflineInZK, forceNewPlan);
     }
   }
 
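The public entry point now threads two flags down to the private worker. A hedged summary of how the commit's callers use them (the bodies below are stubs; the real logic is in the surrounding class):

    import org.apache.hadoop.hbase.HRegionInfo;

    public class AssignEntryPointsSketch {
      // Ordinary callers in this commit (HMaster, ClosedRegionHandler,
      // EnableTableHandler, ServerShutdownHandler) pass setOfflineInZK=true:
      // force the unassigned ZK node to OFFLINE before handing the region out.
      public void assign(HRegionInfo region, boolean setOfflineInZK) {
        assign(region, setOfflineInZK, false);
      }

      // The TimeoutMonitor passes setOfflineInZK=false when the node is
      // already OFFLINE, and forceNewPlan=true when the previous destination
      // is suspect (e.g. a PENDING_OPEN that never happened).
      public void assign(HRegionInfo region, boolean setOfflineInZK,
          boolean forceNewPlan) {
        // ... setOfflineInZooKeeper, getRegionPlan, sendRegionOpen ...
      }
    }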
@@ -800,13 +815,14 @@ public class AssignmentManager extends ZooKeeperListener {
    * Caller must hold lock on the passed <code>state</code> object.
    * @param state
    */
-  private void assign(final RegionState state) {
-    if (!setOfflineInZooKeeper(state)) return;
+  private void assign(final RegionState state, final boolean setOfflineInZK,
+      final boolean forceNewPlan) {
+    if (setOfflineInZK && !setOfflineInZooKeeper(state)) return;
     if (this.master.isStopped()) {
       LOG.debug("Server stopped; skipping assign of " + state);
       return;
     }
-    RegionPlan plan = getRegionPlan(state);
+    RegionPlan plan = getRegionPlan(state, forceNewPlan);
     if (plan == null) return; // Should get reassigned later when RIT times out.
     try {
       LOG.debug("Assigning region " + state.getRegion().getRegionNameAsString() +
@@ -823,12 +839,13 @@ public class AssignmentManager extends ZooKeeperListener {
       // succeed anyways; we need a new plan!
       // Transition back to OFFLINE
       state.update(RegionState.State.OFFLINE);
-      // Remove the plan
-      this.regionPlans.remove(state.getRegion().getEncodedName());
-      // Put in place a new plan and reassign. Calling getRegionPlan will add
-      // a plan if none exists (We removed it in line above).
-      if (getRegionPlan(state, plan.getDestination()) == null) return;
-      assign(state);
+      // Force a new plan and reassign.
+      if (getRegionPlan(state, plan.getDestination(), true) == null) {
+        LOG.warn("Unable to find a viable location to assign region " +
+          state.getRegion().getRegionNameAsString());
+        return;
+      }
+      assign(state, false, false);
     }
   }
 
@@ -890,43 +907,48 @@ public class AssignmentManager extends ZooKeeperListener {
    * @return Plan for passed <code>state</code> (If none currently, it creates one or
    *         if no servers to assign, it returns null).
    */
-  RegionPlan getRegionPlan(final RegionState state) {
-    return getRegionPlan(state, null);
+  RegionPlan getRegionPlan(final RegionState state,
+      final boolean forceNewPlan) {
+    return getRegionPlan(state, null, forceNewPlan);
   }
 
   /**
    * @param state
    * @param serverToExclude Server to exclude (we know its bad). Pass null if
    *        all servers are thought to be assignable.
+   * @param forceNewPlan If true, then if an existing plan exists, a new plan
+   *        will be generated.
    * @return Plan for passed <code>state</code> (If none currently, it creates one or
    *         if no servers to assign, it returns null).
    */
   RegionPlan getRegionPlan(final RegionState state,
-      final HServerInfo serverToExclude) {
+      final HServerInfo serverToExclude, final boolean forceNewPlan) {
     // Pickup existing plan or make a new one
     String encodedName = state.getRegion().getEncodedName();
     List<HServerInfo> servers = this.serverManager.getOnlineServersList();
     // The remove below hinges on the fact that the call to
     // serverManager.getOnlineServersList() returns a copy
     if (serverToExclude != null) servers.remove(serverToExclude);
-    if (servers.size() < 0) return null;
-    RegionPlan newPlan = new RegionPlan(state.getRegion(), null,
+    if (servers.size() <= 0) return null;
+    RegionPlan randomPlan = new RegionPlan(state.getRegion(), null,
       LoadBalancer.randomAssignment(servers));
-    RegionPlan existingPlan = this.regionPlans.putIfAbsent(encodedName, newPlan);
-    RegionPlan plan = null;
-    if (existingPlan == null) {
-      LOG.debug("No previous transition plan for " +
-        state.getRegion().getRegionNameAsString() +
-        " so generated a random one; " + newPlan + "; " +
-        serverManager.countOfRegionServers() +
-        " (online=" + serverManager.getOnlineServers().size() +
-        ", exclude=" + serverToExclude + ") available servers");
-      plan = newPlan;
-    } else {
-      LOG.debug("Using preexisting plan=" + existingPlan);
-      plan = existingPlan;
+    synchronized (this.regionPlans) {
+      RegionPlan existingPlan = this.regionPlans.get(encodedName);
+      if (existingPlan == null || forceNewPlan ||
+          existingPlan.getDestination().equals(serverToExclude)) {
+        LOG.debug("No previous transition plan was found (or we are ignoring " +
+          "an existing plan) for " + state.getRegion().getRegionNameAsString()
+          + " so generated a random one; " + randomPlan + "; " +
+          serverManager.countOfRegionServers() +
+          " (online=" + serverManager.getOnlineServers().size() +
+          ", exclude=" + serverToExclude + ") available servers");
+        this.regionPlans.put(encodedName, randomPlan);
+        return randomPlan;
+      }
+      LOG.debug("Using pre-exisitng plan for region " +
+        state.getRegion().getRegionNameAsString() + "; plan=" + existingPlan);
+      return existingPlan;
     }
-    return plan;
   }
 
   /**
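The interesting change above is dropping putIfAbsent in favor of a synchronized block: the new logic is a compound check-then-act ("reuse the plan unless it is absent, forced stale, or points at the excluded server"), which a single atomic map call cannot express. A toy, non-HBase illustration of the same pattern:

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;

    public class PlanMapSketch {
      private final ConcurrentMap<String, String> plans =
        new ConcurrentHashMap<String, String>();

      String planFor(String region, String excludedServer, boolean forceNew,
          String freshPlan) {
        synchronized (plans) {  // one lock spans the whole check-then-act
          String existing = plans.get(region);
          if (existing == null || forceNew || existing.equals(excludedServer)) {
            plans.put(region, freshPlan);
            return freshPlan;
          }
          return existing;      // still-valid plan; reuse it
        }
      }
    }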
@@ -974,10 +996,10 @@ public class AssignmentManager extends ZooKeeperListener {
       if (state == null) {
         state = new RegionState(region, RegionState.State.PENDING_CLOSE);
         regionsInTransition.put(encodedName, state);
-      } else if (force && (state.isClosing() || state.isPendingClose())) {
+      } else if (force && state.isPendingClose()) {
         LOG.debug("Attempting to unassign region " +
-          region.getRegionNameAsString() + " which is already closing but " +
-          "forcing an additional close");
+          region.getRegionNameAsString() + " which is already pending close "
+          + "but forcing an additional close");
         state.update(RegionState.State.PENDING_CLOSE);
       } else {
         LOG.debug("Attempting to unassign region " +
@@ -987,20 +1009,26 @@ public class AssignmentManager extends ZooKeeperListener {
       }
     }
     // Send CLOSE RPC
+    HServerInfo server = null;
+    synchronized (this.regions) {
+      server = regions.get(region);
+    }
     try {
       // TODO: We should consider making this look more like it does for the
       // region open where we catch all throwables and never abort
-      if(serverManager.sendRegionClose(regions.get(region),
-          state.getRegion())) {
-        LOG.debug("Sent CLOSE to " + regions.get(region) + " for region " +
+      if(serverManager.sendRegionClose(server, state.getRegion())) {
+        LOG.debug("Sent CLOSE to " + server + " for region " +
           region.getRegionNameAsString());
         return;
       }
+      LOG.debug("Server " + server + " region CLOSE RPC returned false");
     } catch (NotServingRegionException nsre) {
       // Failed to close, so pass through and reassign
+      LOG.debug("Server " + server + " returned NotServingRegionException");
     } catch (RemoteException re) {
       if (re.unwrapRemoteException() instanceof NotServingRegionException) {
         // Failed to close, so pass through and reassign
+        LOG.debug("Server " + server + " returned NotServingRegionException");
       } else {
         this.master.abort("Remote unexpected exception",
           re.unwrapRemoteException());
@@ -1011,13 +1039,13 @@ public class AssignmentManager extends ZooKeeperListener {
       this.master.abort("Remote unexpected exception", t);
     }
     // Did not CLOSE, so set region offline and assign it
-    LOG.debug("Attempted to send CLOSE to " + regions.get(region) +
+    LOG.debug("Attempted to send CLOSE to " + server +
       " for region " + region.getRegionNameAsString() + " but failed, " +
       "setting region as OFFLINE and reassigning");
     synchronized (regionsInTransition) {
       forceRegionStateToOffline(region);
-      assign(region);
     }
+    assign(region, true);
   }
 
   /**
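unassign() now reads the owning server out of this.regions exactly once, under the map's lock, and reuses that snapshot for the RPC and every log message. A toy illustration of the pattern (plain collections, hypothetical names):

    import java.util.HashMap;
    import java.util.Map;

    public class SnapshotReadSketch {
      private final Map<String, String> regions = new HashMap<String, String>();

      void close(String region) {
        String server;
        synchronized (this.regions) {  // one consistent read...
          server = this.regions.get(region);
        }
        // ...so the RPC target and the log lines can no longer disagree,
        // which they could when regions.get(region) was called repeatedly.
        System.out.println("Sending CLOSE to " + server + " for " + region);
      }
    }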
@@ -1049,7 +1077,7 @@ public class AssignmentManager extends ZooKeeperListener {
    */
   public void assignRoot() throws KeeperException {
     RootLocationEditor.deleteRootLocation(this.master.getZooKeeper());
-    assign(HRegionInfo.ROOT_REGIONINFO);
+    assign(HRegionInfo.ROOT_REGIONINFO, true);
   }
 
   /**
@@ -1062,7 +1090,7 @@ public class AssignmentManager extends ZooKeeperListener {
    */
   public void assignMeta() {
     // Force assignment to a random server
-    assign(HRegionInfo.FIRST_META_REGIONINFO);
+    assign(HRegionInfo.FIRST_META_REGIONINFO, true);
   }
 
   /**
@@ -1460,67 +1488,73 @@ public class AssignmentManager extends ZooKeeperListener {
         LOG.info("Regions in transition timed out: " + regionState);
         // Expired! Do a retry.
         switch (regionState.getState()) {
-          case OFFLINE:
           case CLOSED:
-            LOG.info("Region has been OFFLINE or CLOSED for too long, " +
-              "reassigning " + regionInfo.getRegionNameAsString());
-            assign(regionState.getRegion());
+            LOG.info("Region has been CLOSED for too long, " +
+              "retriggering ClosedRegionHandler");
+            AssignmentManager.this.executorService.submit(
+              new ClosedRegionHandler(master, AssignmentManager.this,
+                regionState.getRegion()));
+            break;
+          case OFFLINE:
+            LOG.info("Region has been OFFLINE for too long, " +
+              "reassigning " + regionInfo.getRegionNameAsString() +
+              " to a random server");
+            assign(regionState.getRegion(), false);
             break;
           case PENDING_OPEN:
             LOG.info("Region has been PENDING_OPEN for too " +
               "long, reassigning region=" +
               regionInfo.getRegionNameAsString());
-            // Should have a ZK node in OFFLINE state or no node at all
-            try {
-              if (ZKUtil.watchAndCheckExists(watcher,
-                  ZKAssign.getNodeName(watcher,
-                    regionInfo.getEncodedName())) &&
-                  !ZKAssign.verifyRegionState(watcher, regionInfo,
-                    EventType.M_ZK_REGION_OFFLINE)) {
-                LOG.info("Region exists and not in expected OFFLINE " +
-                  "state so skipping timeout, region=" +
-                  regionInfo.getRegionNameAsString());
-                break;
-              }
-            } catch (KeeperException ke) {
-              LOG.error("Unexpected ZK exception timing out " +
-                "PENDING_CLOSE region",
-                ke);
-              break;
-            }
-            AssignmentManager.this.setOffline(regionState.getRegion());
-            regionState.update(RegionState.State.OFFLINE);
-            assign(regionState.getRegion());
+            assign(regionState.getRegion(), false, true);
             break;
           case OPENING:
             LOG.info("Region has been OPENING for too " +
               "long, reassigning region=" +
               regionInfo.getRegionNameAsString());
             // Should have a ZK node in OPENING state
             try {
-              if (ZKUtil.watchAndCheckExists(watcher,
-                  ZKAssign.getNodeName(watcher,
-                    regionInfo.getEncodedName())) &&
-                  ZKAssign.transitionNode(watcher, regionInfo,
-                    HMaster.MASTER, EventType.RS_ZK_REGION_OPENING,
-                    EventType.M_ZK_REGION_OFFLINE, -1) == -1) {
-                LOG.info("Region transitioned out of OPENING so " +
-                  "skipping timeout, region=" +
-                  regionInfo.getRegionNameAsString());
+              String node = ZKAssign.getNodeName(watcher,
+                regionInfo.getEncodedName());
+              Stat stat = new Stat();
+              RegionTransitionData data = ZKAssign.getDataNoWatch(watcher,
+                node, stat);
+              if (data.getEventType() == EventType.RS_ZK_REGION_OPENED) {
+                LOG.debug("Region has transitioned to OPENED, allowing " +
+                  "watched event handlers to process");
                 break;
+              } else if (data.getEventType() !=
+                  EventType.RS_ZK_REGION_OPENING) {
+                LOG.warn("While timing out a region in state OPENING, " +
+                  "found ZK node in unexpected state: " +
+                  data.getEventType());
+                break;
+              }
+              // Attempt to transition node into OFFLINE
+              try {
+                data = new RegionTransitionData(
+                  EventType.M_ZK_REGION_OFFLINE,
+                  regionInfo.getRegionName());
+                if (ZKUtil.setData(watcher, node, data.getBytes(),
+                    stat.getVersion())) {
+                  // Node is now OFFLINE, let's trigger another assignment
+                  ZKUtil.getDataAndWatch(watcher, node); // re-set the watch
+                  LOG.info("Successfully transitioned region=" +
+                    regionInfo.getRegionNameAsString() + " into OFFLINE" +
+                    " and forcing a new assignment");
+                  assign(regionState, false, true);
+                }
+              } catch (KeeperException.NoNodeException nne) {
+                // Node did not exist, can't time this out
               }
             } catch (KeeperException ke) {
              LOG.error("Unexpected ZK exception timing out CLOSING region",
                ke);
              break;
            }
-            AssignmentManager.this.setOffline(regionState.getRegion());
-            regionState.update(RegionState.State.OFFLINE);
-            assign(regionState.getRegion());
             break;
           case OPEN:
-            LOG.warn("Long-running region in OPEN state? This should " +
-              "not happen; region=" + regionInfo.getRegionNameAsString());
+            LOG.error("Region has been OPEN for too long, " +
+              "we don't know where region was opened so can't do anything");
             break;
           case PENDING_CLOSE:
             LOG.info("Region has been PENDING_CLOSE for too " +
@@ -1544,20 +1578,8 @@ public class AssignmentManager extends ZooKeeperListener {
             break;
           case CLOSING:
             LOG.info("Region has been CLOSING for too " +
-              "long, running forced unassign again on region=" +
-              regionInfo.getRegionNameAsString());
-            try {
-              if (ZKAssign.deleteClosingNode(watcher,
-                  regionInfo.getEncodedName())) {
-                unassign(regionInfo, true);
-              }
-            } catch (NoNodeException e) {
-              LOG.debug("Node no longer existed so not forcing another " +
-                "unassignment");
-            } catch (KeeperException e) {
-              LOG.warn("Unexpected ZK exception timing out a region " +
-                "close", e);
-            }
+              "long, this should eventually complete or the server will " +
+              "expire, doing nothing");
             break;
         }
       }
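The OPENING timeout above is the heart of the fix: instead of blindly transitioning the node, the master reads the node's payload and version with getDataNoWatch() and then performs a version-conditioned write, so a regionserver that raced ahead (for example, to OPENED) wins and the master backs off. A generic sketch of that read-then-conditional-write pattern against the plain ZooKeeper API (path and payload are placeholders, not HBase code):

    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.ZooKeeper;
    import org.apache.zookeeper.data.Stat;

    public class VersionedTransitionSketch {
      static boolean tryTransition(ZooKeeper zk, String node, byte[] payload)
          throws KeeperException, InterruptedException {
        Stat stat = new Stat();
        zk.getData(node, false, stat);    // no watch; remember the version
        // ... caller inspects the data and bails out if the state is wrong ...
        try {
          // Succeeds only if nobody changed the node since our read.
          zk.setData(node, payload, stat.getVersion());
          return true;
        } catch (KeeperException.BadVersionException e) {
          return false;                   // lost the race; leave region alone
        }
      }
    }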
@@ -1569,54 +1591,42 @@ public class AssignmentManager extends ZooKeeperListener {
   /**
    * Process shutdown server removing any assignments.
    * @param hsi Server that went down.
+   * @return set of regions on this server that are not in transition
    */
-  public void processServerShutdown(final HServerInfo hsi) {
-    // Clean out any exisiting assignment plans for this server
-    for (Iterator <Map.Entry<String, RegionPlan>> i =
-        this.regionPlans.entrySet().iterator(); i.hasNext();) {
-      Map.Entry<String, RegionPlan> e = i.next();
-      if (e.getValue().getDestination().equals(hsi)) {
-        // Use iterator's remove else we'll get CME
-        i.remove();
-      }
-    }
-    // Remove assignment info related to the downed server. Remove the downed
-    // server from list of servers else it looks like a server w/ no load.
-    synchronized (this.regions) {
-      Set<HRegionInfo> hris = new HashSet<HRegionInfo>();
-      for (Map.Entry<HRegionInfo, HServerInfo> e: this.regions.entrySet()) {
-        // Add to a Set -- don't call setOffline in here else we get a CME.
-        if (e.getValue().equals(hsi)) hris.add(e.getKey());
-      }
-      for (HRegionInfo hri: hris) setOffline(hri);
-      this.servers.remove(hsi);
-    }
-    // If anything in transition related to the server, clean it up.
-    synchronized (regionsInTransition) {
-      // Iterate all regions in transition checking if were on this server
-      final String serverName = hsi.getServerName();
-      for (Map.Entry<String, RegionState> e: this.regionsInTransition.entrySet()) {
-        if (!e.getKey().equals(serverName)) continue;
-        RegionState regionState = e.getValue();
-        switch(regionState.getState()) {
-          case PENDING_OPEN:
-          case OPENING:
-          case OFFLINE:
-          case CLOSED:
-          case PENDING_CLOSE:
-          case CLOSING:
-            LOG.info("Region " + regionState.getRegion().getRegionNameAsString() +
-              " was in state=" + regionState.getState() + " on shutdown server=" +
-              serverName + ", reassigning");
-            assign(regionState.getRegion());
-            break;
-
-          case OPEN:
-            LOG.warn("Long-running region in OPEN state? Should not happen");
-            break;
+  public List<HRegionInfo> processServerShutdown(final HServerInfo hsi) {
+    // Clean out any existing assignment plans for this server
+    synchronized (this.regionPlans) {
+      for (Iterator <Map.Entry<String, RegionPlan>> i =
+          this.regionPlans.entrySet().iterator(); i.hasNext();) {
+        Map.Entry<String, RegionPlan> e = i.next();
+        if (e.getValue().getDestination().equals(hsi)) {
+          // Use iterator's remove else we'll get CME
+          i.remove();
         }
       }
     }
+    // TODO: Do we want to sync on RIT here?
+    // Remove this server from map of servers to regions, and remove all regions
+    // of this server from online map of regions.
+    Set<HRegionInfo> deadRegions = null;
+    synchronized (this.regions) {
+      deadRegions = new TreeSet<HRegionInfo>(this.servers.remove(hsi));
+      for (HRegionInfo region : deadRegions) {
+        this.regions.remove(region);
+      }
+    }
+    // See if any of the regions that were online on this server were in RIT
+    // If they are, normal timeouts will deal with them appropriately so
+    // let's skip a manual re-assignment.
+    List<HRegionInfo> rits = new ArrayList<HRegionInfo>();
+    synchronized (regionsInTransition) {
+      for (RegionState region : this.regionsInTransition.values()) {
+        if (deadRegions.remove(region.getRegion())) {
+          rits.add(region.getRegion());
+        }
+      }
+    }
+    return rits;
   }
 
   /**
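processServerShutdown() no longer reassigns in-transition regions itself; it hands the list back so ServerShutdownHandler can skip them and leave them to the TimeoutMonitor. A toy version of that partition step (plain collections, hypothetical names):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Set;
    import java.util.TreeSet;

    public class ShutdownPartitionSketch {
      static List<String> partition(Set<String> regionsOnDeadServer,
          Set<String> regionsInTransition) {
        Set<String> dead = new TreeSet<String>(regionsOnDeadServer);
        List<String> rits = new ArrayList<String>();
        for (String region : regionsInTransition) {
          if (dead.remove(region)) {  // on the dead server AND in transition
            rits.add(region);
          }
        }
        return rits;  // the caller drops these from its reassign set
      }
    }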
@@ -751,7 +751,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
       region.getLog().closeAndDelete();
 
       // 4. Trigger immediate assignment of this region
-      assignmentManager.assign(region.getRegionInfo());
+      assignmentManager.assign(region.getRegionInfo(), true);
     }
 
     // 5. If sync, wait for assignment of regions
@@ -958,7 +958,7 @@ implements HMasterInterface, HMasterRegionInterface, MasterServices, Server {
   }
 
   public void assignRegion(HRegionInfo hri) {
-    assignmentManager.assign(hri);
+    assignmentManager.assign(hri, true);
   }
 
   /**
@@ -580,12 +580,16 @@ public class ServerManager {
    */
   public boolean sendRegionClose(HServerInfo server, HRegionInfo region)
   throws IOException {
-    if (server == null) return false;
+    if (server == null) {
+      LOG.debug("Unable to send region close because server is null; region=" +
+        region.getRegionNameAsString());
+      return false;
+    }
     HRegionInterface hri = getServerConnection(server);
     if(hri == null) {
       LOG.warn("Attempting to send CLOSE RPC to server " +
-        server.getServerName() + " failed because no RPC connection found " +
-        "to this server");
+        server.getServerName() + " for region " + region.getRegionNameAsString()
+        + " failed because no RPC connection found to this server");
       return false;
     }
     return hri.closeRegion(region);
@@ -24,7 +24,6 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.executor.EventHandler;
-import org.apache.hadoop.hbase.executor.RegionTransitionData;
 import org.apache.hadoop.hbase.master.AssignmentManager;
 
 /**
@@ -39,7 +38,6 @@ public class ClosedRegionHandler extends EventHandler implements TotesHRegionInfo {
   private static final Log LOG = LogFactory.getLog(ClosedRegionHandler.class);
 
   private final AssignmentManager assignmentManager;
-  private final RegionTransitionData data;
   private final HRegionInfo regionInfo;
 
   private final ClosedPriority priority;
@@ -58,12 +56,10 @@ public class ClosedRegionHandler extends EventHandler implements TotesHRegionInfo {
     }
   };
 
-  public ClosedRegionHandler(Server server,
-    AssignmentManager assignmentManager, RegionTransitionData data,
+  public ClosedRegionHandler(Server server, AssignmentManager assignmentManager,
     HRegionInfo regionInfo) {
     super(server, EventType.RS_ZK_REGION_CLOSED);
     this.assignmentManager = assignmentManager;
-    this.data = data;
     this.regionInfo = regionInfo;
     if(regionInfo.isRootRegion()) {
       priority = ClosedPriority.ROOT;
@@ -94,6 +90,6 @@ public class ClosedRegionHandler extends EventHandler implements TotesHRegionInfo {
     }
     // ZK Node is in CLOSED state, assign it.
    assignmentManager.setOffline(regionInfo);
-    assignmentManager.assign(regionInfo);
+    assignmentManager.assign(regionInfo, true);
   }
 }
@@ -77,7 +77,7 @@ public class EnableTableHandler extends EventHandler {
     assignmentManager.undisableTable(this.tableNameStr);
     // Verify all regions of table are disabled
     for (HRegionInfo region : regions) {
-      assignmentManager.assign(region);
+      assignmentManager.assign(region, true);
     }
     // Wait on table's regions to clear region in transition.
     for (HRegionInfo region: regions) {
@@ -25,7 +25,6 @@ import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.HServerInfo;
 import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.executor.EventHandler;
-import org.apache.hadoop.hbase.executor.RegionTransitionData;
 import org.apache.hadoop.hbase.master.AssignmentManager;
 import org.apache.hadoop.hbase.zookeeper.ZKAssign;
 import org.apache.zookeeper.KeeperException;
@@ -55,8 +54,8 @@ public class OpenedRegionHandler extends EventHandler implements TotesHRegionInfo {
   };
 
   public OpenedRegionHandler(Server server,
-      AssignmentManager assignmentManager, RegionTransitionData data,
-      HRegionInfo regionInfo, HServerInfo serverInfo) {
+      AssignmentManager assignmentManager, HRegionInfo regionInfo,
+      HServerInfo serverInfo) {
     super(server, EventType.RS_ZK_REGION_OPENED);
     this.assignmentManager = assignmentManager;
     this.regionInfo = regionInfo;
@@ -20,6 +20,7 @@
 package org.apache.hadoop.hbase.master.handler;
 
 import java.io.IOException;
+import java.util.List;
 import java.util.Map;
 import java.util.NavigableMap;
 
|
@ -97,7 +98,8 @@ public class ServerShutdownHandler extends EventHandler {
|
||||||
// doing after log splitting. Could do some states before -- OPENING?
|
// doing after log splitting. Could do some states before -- OPENING?
|
||||||
// OFFLINE? -- and then others after like CLOSING that depend on log
|
// OFFLINE? -- and then others after like CLOSING that depend on log
|
||||||
// splitting.
|
// splitting.
|
||||||
this.services.getAssignmentManager().processServerShutdown(this.hsi);
|
List<HRegionInfo> regionsInTransition =
|
||||||
|
this.services.getAssignmentManager().processServerShutdown(this.hsi);
|
||||||
|
|
||||||
// Assign root and meta if we were carrying them.
|
// Assign root and meta if we were carrying them.
|
||||||
if (isCarryingRoot()) { // -ROOT-
|
if (isCarryingRoot()) { // -ROOT-
|
||||||
|
@ -113,41 +115,66 @@ public class ServerShutdownHandler extends EventHandler {
|
||||||
if (isCarryingMeta()) this.services.getAssignmentManager().assignMeta();
|
if (isCarryingMeta()) this.services.getAssignmentManager().assignMeta();
|
||||||
|
|
||||||
// Wait on meta to come online; we need it to progress.
|
// Wait on meta to come online; we need it to progress.
|
||||||
try {
|
// TODO: Best way to hold strictly here? We should build this retry logic
|
||||||
this.server.getCatalogTracker().waitForMeta();
|
// into the MetaReader operations themselves.
|
||||||
} catch (InterruptedException e) {
|
NavigableMap<HRegionInfo, Result> hris = null;
|
||||||
Thread.currentThread().interrupt();
|
while (!this.server.isStopped()) {
|
||||||
throw new IOException("Interrupted", e);
|
try {
|
||||||
|
this.server.getCatalogTracker().waitForMeta();
|
||||||
|
hris = MetaReader.getServerUserRegions(this.server.getCatalogTracker(),
|
||||||
|
this.hsi);
|
||||||
|
break;
|
||||||
|
} catch (InterruptedException e) {
|
||||||
|
Thread.currentThread().interrupt();
|
||||||
|
throw new IOException("Interrupted", e);
|
||||||
|
} catch (IOException ioe) {
|
||||||
|
LOG.info("Received exception accessing META during server shutdown of " +
|
||||||
|
serverName + ", retrying META read");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NavigableMap<HRegionInfo, Result> hris =
|
// Remove regions that were in transition
|
||||||
MetaReader.getServerUserRegions(this.server.getCatalogTracker(), this.hsi);
|
for (HRegionInfo rit : regionsInTransition) hris.remove(rit);
|
||||||
LOG.info("Reassigning the " + hris.size() + " region(s) that " + serverName +
|
LOG.info("Reassigning the " + hris.size() + " region(s) that " + serverName
|
||||||
" was carrying");
|
+ " was carrying (skipping " + regionsInTransition.size() +
|
||||||
|
" regions(s) that are in transition)");
|
||||||
|
|
||||||
// We should encounter -ROOT- and .META. first in the Set given how its
|
// Iterate regions that were on this server and assign them
|
||||||
// a sorted set.
|
|
||||||
for (Map.Entry<HRegionInfo, Result> e: hris.entrySet()) {
|
for (Map.Entry<HRegionInfo, Result> e: hris.entrySet()) {
|
||||||
processDeadRegion(e.getKey(), e.getValue(),
|
if (processDeadRegion(e.getKey(), e.getValue(),
|
||||||
this.services.getAssignmentManager(),
|
this.services.getAssignmentManager(),
|
||||||
this.server.getCatalogTracker());
|
this.server.getCatalogTracker())) {
|
||||||
this.services.getAssignmentManager().assign(e.getKey());
|
this.services.getAssignmentManager().assign(e.getKey(), true);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
this.deadServers.remove(serverName);
|
this.deadServers.remove(serverName);
|
||||||
LOG.info("Finished processing of shutdown of " + serverName);
|
LOG.info("Finished processing of shutdown of " + serverName);
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void processDeadRegion(HRegionInfo hri, Result result,
|
/**
|
||||||
|
* Process a dead region from a dead RS. Checks if the region is disabled
|
||||||
|
* or if the region has a partially completed split.
|
||||||
|
* <p>
|
||||||
|
* Returns true if specified region should be assigned, false if not.
|
||||||
|
* @param hri
|
||||||
|
* @param result
|
||||||
|
* @param assignmentManager
|
||||||
|
* @param catalogTracker
|
||||||
|
* @return
|
||||||
|
* @throws IOException
|
||||||
|
*/
|
||||||
|
public static boolean processDeadRegion(HRegionInfo hri, Result result,
|
||||||
AssignmentManager assignmentManager, CatalogTracker catalogTracker)
|
AssignmentManager assignmentManager, CatalogTracker catalogTracker)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
// If table is not disabled but the region is offlined,
|
// If table is not disabled but the region is offlined,
|
||||||
boolean disabled = assignmentManager.isTableDisabled(
|
boolean disabled = assignmentManager.isTableDisabled(
|
||||||
hri.getTableDesc().getNameAsString());
|
hri.getTableDesc().getNameAsString());
|
||||||
if (disabled) return;
|
if (disabled) return false;
|
||||||
if (hri.isOffline() && hri.isSplit()) {
|
if (hri.isOffline() && hri.isSplit()) {
|
||||||
fixupDaughters(result, assignmentManager, catalogTracker);
|
fixupDaughters(result, assignmentManager, catalogTracker);
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -183,7 +210,7 @@ public class ServerShutdownHandler extends EventHandler {
|
||||||
if (pair == null || pair.getFirst() == null) {
|
if (pair == null || pair.getFirst() == null) {
|
||||||
LOG.info("Fixup; missing daughter " + hri.getEncodedName());
|
LOG.info("Fixup; missing daughter " + hri.getEncodedName());
|
||||||
MetaEditor.addDaughter(catalogTracker, hri, null);
|
MetaEditor.addDaughter(catalogTracker, hri, null);
|
||||||
assignmentManager.assign(hri);
|
assignmentManager.assign(hri, true);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -741,6 +741,33 @@ public class ZKAssign {
     return RegionTransitionData.fromBytes(data);
   }
 
+  /**
+   * Gets the current data in the unassigned node for the specified region name
+   * or fully-qualified path.
+   *
+   * <p>Returns null if the region does not currently have a node.
+   *
+   * <p>Does not set a watch.
+   *
+   * @param watcher zk reference
+   * @param pathOrRegionName fully-specified path or region name
+   * @param stat object to store node info into on getData call
+   * @return data for the unassigned node
+   * @throws KeeperException
+   * @throws KeeperException if unexpected zookeeper exception
+   */
+  public static RegionTransitionData getDataNoWatch(ZooKeeperWatcher zkw,
+      String pathOrRegionName, Stat stat)
+  throws KeeperException {
+    String node = pathOrRegionName.startsWith("/") ?
+      pathOrRegionName : getNodeName(zkw, pathOrRegionName);
+    byte [] data = ZKUtil.getDataNoWatch(zkw, node, stat);
+    if(data == null) {
+      return null;
+    }
+    return RegionTransitionData.fromBytes(data);
+  }
+
   /**
    * Delete the assignment node regardless of its current state.
    * <p>
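A hedged sketch of how the TimeoutMonitor above consumes this new helper: one call returns both the payload and the node's version (via the caller's Stat) and plants no watch, so merely inspecting a region generates no events. The imports mirror classes already named in this diff; the wrapper class itself is hypothetical:

    import org.apache.hadoop.hbase.executor.RegionTransitionData;
    import org.apache.hadoop.hbase.zookeeper.ZKAssign;
    import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.data.Stat;

    public class GetDataNoWatchSketch {
      static Integer nodeVersionIfPresent(ZooKeeperWatcher zkw,
          String encodedRegionName) throws KeeperException {
        Stat stat = new Stat();
        RegionTransitionData data =
          ZKAssign.getDataNoWatch(zkw, encodedRegionName, stat);
        if (data == null) return null;  // region has no unassigned node
        return stat.getVersion();       // gates a later conditional setData
      }
    }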
@@ -339,17 +339,22 @@ public class TestMasterFailover {
    * ZK = CLOSING
    */
 
-    // Region of enabled table being closed but not complete
-    // Region is already assigned, don't say anything to RS but set ZK closing
-    region = enabledAndAssignedRegions.remove(0);
-    regionsThatShouldBeOnline.add(region);
-    ZKAssign.createNodeClosing(zkw, region, serverName);
-
-    // Region of disabled table being closed but not complete
-    // Region is already assigned, don't say anything to RS but set ZK closing
-    region = disabledAndAssignedRegions.remove(0);
-    regionsThatShouldBeOffline.add(region);
-    ZKAssign.createNodeClosing(zkw, region, serverName);
+    // Disabled test of CLOSING. This case is invalid after HBASE-3181.
+    // How can an RS stop a CLOSING w/o deleting the node? If it did ever fail
+    // and left the node in CLOSING, the RS would have aborted and we'd process
+    // these regions in server shutdown
+    //
+    // // Region of enabled table being closed but not complete
+    // // Region is already assigned, don't say anything to RS but set ZK closing
+    // region = enabledAndAssignedRegions.remove(0);
+    // regionsThatShouldBeOnline.add(region);
+    // ZKAssign.createNodeClosing(zkw, region, serverName);
+    //
+    // // Region of disabled table being closed but not complete
+    // // Region is already assigned, don't say anything to RS but set ZK closing
+    // region = disabledAndAssignedRegions.remove(0);
+    // regionsThatShouldBeOffline.add(region);
+    // ZKAssign.createNodeClosing(zkw, region, serverName);
 
   /*
    * ZK = CLOSED
@@ -797,26 +802,32 @@ public class TestMasterFailover {
 
     // Let's add some weird states to master in-memory state
 
+    // After HBASE-3181, we need to have some ZK state if we're PENDING_OPEN
+    // b/c it is impossible for us to get into this state w/o a zk node
+    // this is not true of PENDING_CLOSE
+
     // PENDING_OPEN and enabled
     region = enabledRegions.remove(0);
     regionsThatShouldBeOnline.add(region);
     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
-      new RegionState(region, RegionState.State.PENDING_OPEN));
+      new RegionState(region, RegionState.State.PENDING_OPEN, 0));
+    ZKAssign.createNodeOffline(zkw, region, master.getServerName());
     // PENDING_OPEN and disabled
     region = disabledRegions.remove(0);
     regionsThatShouldBeOffline.add(region);
     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
-      new RegionState(region, RegionState.State.PENDING_OPEN));
+      new RegionState(region, RegionState.State.PENDING_OPEN, 0));
+    ZKAssign.createNodeOffline(zkw, region, master.getServerName());
     // PENDING_CLOSE and enabled
     region = enabledRegions.remove(0);
     regionsThatShouldBeOnline.add(region);
     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
-      new RegionState(region, RegionState.State.PENDING_CLOSE));
+      new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
     // PENDING_CLOSE and disabled
     region = disabledRegions.remove(0);
     regionsThatShouldBeOffline.add(region);
     master.assignmentManager.regionsInTransition.put(region.getEncodedName(),
-      new RegionState(region, RegionState.State.PENDING_CLOSE));
+      new RegionState(region, RegionState.State.PENDING_CLOSE, 0));
 
     // Failover should be completed, now wait for no RIT
     log("Waiting for no more RIT");
@ -28,6 +28,7 @@ import java.util.TreeSet;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||||
import org.apache.hadoop.hbase.HRegionInfo;
|
import org.apache.hadoop.hbase.HRegionInfo;
|
||||||
import org.apache.hadoop.hbase.MiniHBaseCluster;
|
import org.apache.hadoop.hbase.MiniHBaseCluster;
|
||||||
|
@ -37,6 +38,7 @@ import org.apache.hadoop.hbase.util.JVMClusterUtil.MasterThread;
|
||||||
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
|
||||||
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
import org.apache.hadoop.hbase.zookeeper.ZKAssign;
|
||||||
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
import org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher;
|
||||||
|
import org.apache.zookeeper.KeeperException;
|
||||||
import org.junit.Test;
|
import org.junit.Test;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -51,38 +53,50 @@ public class TestRollingRestart {
|
||||||
// Start a cluster with 2 masters and 4 regionservers
|
     // Start a cluster with 2 masters and 3 regionservers
     final int NUM_MASTERS = 2;
     final int NUM_RS = 3;
-    final int NUM_REGIONS_TO_CREATE = 27;
+    final int NUM_REGIONS_TO_CREATE = 20;

     int expectedNumRS = 3;

     // Start the cluster
-    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+    log("Starting cluster");
+    Configuration conf = HBaseConfiguration.create();
+    conf.setInt("hbase.master.assignment.timeoutmonitor.period", 2000);
+    conf.setInt("hbase.master.assignment.timeoutmonitor.timeout", 5000);
+    HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(conf);
     TEST_UTIL.startMiniCluster(NUM_MASTERS, NUM_RS);
     MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
+    log("Waiting for active/ready master");
     cluster.waitForActiveAndReadyMaster();
-    Configuration conf = TEST_UTIL.getConfiguration();
     ZooKeeperWatcher zkw = new ZooKeeperWatcher(conf, "testRollingRestart",
         null);
+    HMaster master = cluster.getMaster();

     // Create a table with regions
     byte [] table = Bytes.toBytes("tableRestart");
     byte [] family = Bytes.toBytes("family");
+    log("Creating table with " + NUM_REGIONS_TO_CREATE + " regions");
     HTable ht = TEST_UTIL.createTable(table, family);
     int numRegions = TEST_UTIL.createMultiRegions(conf, ht, family,
         NUM_REGIONS_TO_CREATE);
     numRegions += 2; // catalogs
-    LOG.debug("\n\nWaiting for no more RIT\n");
-    ZKAssign.blockUntilNoRIT(zkw);
-    LOG.debug("\n\nDisabling table\n");
+    log("Waiting for no more RIT\n");
+    blockUntilNoRIT(zkw, master);
+    log("Disabling table\n");
     TEST_UTIL.getHBaseAdmin().disableTable(table);
-    LOG.debug("\n\nWaiting for no more RIT\n");
-    ZKAssign.blockUntilNoRIT(zkw);
-    LOG.debug("\n\nEnabling table\n");
-    TEST_UTIL.getHBaseAdmin().enableTable(table);
-    LOG.debug("\n\nWaiting for no more RIT\n");
-    ZKAssign.blockUntilNoRIT(zkw);
-    LOG.debug("\n\nVerifying there are " + numRegions + " assigned on cluster\n");
+    log("Waiting for no more RIT\n");
+    blockUntilNoRIT(zkw, master);
     NavigableSet<String> regions = getAllOnlineRegions(cluster);
+    log("Verifying only catalog regions are assigned\n");
+    if (regions.size() != 2) {
+      for (String oregion : regions) log("Region still online: " + oregion);
+    }
+    assertEquals(2, regions.size());
+    log("Enabling table\n");
+    TEST_UTIL.getHBaseAdmin().enableTable(table);
+    log("Waiting for no more RIT\n");
+    blockUntilNoRIT(zkw, master);
+    log("Verifying there are " + numRegions + " assigned on cluster\n");
+    regions = getAllOnlineRegions(cluster);
     assertRegionsAssigned(cluster, regions);
     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
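Note: the two hbase.master.assignment.timeoutmonitor.* properties lowered above tune the master's TimeoutMonitor so the test sees RIT timeouts quickly: "period" is how often the monitor chore wakes up, "timeout" is how long a region may sit in transition before the master acts on it. For orientation, a minimal standalone sketch of that idea follows; all names are illustrative, not the trunk implementation.

import java.util.Map;
import java.util.concurrent.ConcurrentSkipListMap;

// Minimal sketch of the TimeoutMonitor idea: a chore that wakes every
// "period" ms and flags any region that has sat in transition longer
// than "timeout" ms. Illustrative names only.
public class TimeoutMonitorSketch {
  static class RegionStateSketch {
    final String encodedName;
    volatile long stamp; // last time this region's state was updated
    RegionStateSketch(String encodedName, long stamp) {
      this.encodedName = encodedName;
      this.stamp = stamp;
    }
  }

  final Map<String, RegionStateSketch> regionsInTransition =
      new ConcurrentSkipListMap<String, RegionStateSketch>();
  final long timeoutMs;

  TimeoutMonitorSketch(long timeoutMs) {
    this.timeoutMs = timeoutMs;
  }

  // Body of one chore run; the real master would re-drive the stuck
  // assignment here instead of just logging it.
  void chore() {
    long now = System.currentTimeMillis();
    for (RegionStateSketch rs : regionsInTransition.values()) {
      if (now - rs.stamp > timeoutMs) {
        System.out.println("RIT timeout: " + rs.encodedName);
      }
    }
  }

  public static void main(String[] args) {
    TimeoutMonitorSketch monitor = new TimeoutMonitorSketch(5000);
    monitor.regionsInTransition.put("abc123",
        new RegionStateSketch("abc123", System.currentTimeMillis() - 6000));
    monitor.chore(); // prints "RIT timeout: abc123"
  }
}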
@@ -93,7 +107,7 @@ public class TestRollingRestart {
     restarted.waitForServerOnline();
     log("Additional RS is online");
     log("Waiting for no more RIT");
-    ZKAssign.blockUntilNoRIT(zkw);
+    blockUntilNoRIT(zkw, master);
     log("Verifying there are " + numRegions + " assigned on cluster");
     assertRegionsAssigned(cluster, regions);
     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
@@ -112,22 +126,23 @@ public class TestRollingRestart {
     }

     // Bring down the backup master
-    LOG.debug("\n\nStopping backup master\n\n");
+    log("Stopping backup master\n\n");
     backupMaster.getMaster().stop("Stop of backup during rolling restart");
     cluster.hbaseCluster.waitOnMaster(backupMaster);

     // Bring down the primary master
-    LOG.debug("\n\nStopping primary master\n\n");
+    log("Stopping primary master\n\n");
     activeMaster.getMaster().stop("Stop of active during rolling restart");
     cluster.hbaseCluster.waitOnMaster(activeMaster);

     // Start primary master
-    LOG.debug("\n\nRestarting primary master\n\n");
+    log("Restarting primary master\n\n");
     activeMaster = cluster.startMaster();
     cluster.waitForActiveAndReadyMaster();
+    master = activeMaster.getMaster();

     // Start backup master
-    LOG.debug("\n\nRestarting backup master\n\n");
+    log("Restarting backup master\n\n");
     backupMaster = cluster.startMaster();

     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
@@ -148,7 +163,7 @@ public class TestRollingRestart {
     log("Waiting for RS shutdown to be handled by master");
     waitForRSShutdownToStartAndFinish(activeMaster, serverName);
     log("RS shutdown done, waiting for no more RIT");
-    ZKAssign.blockUntilNoRIT(zkw);
+    blockUntilNoRIT(zkw, master);
     log("Verifying there are " + numRegions + " assigned on cluster");
     assertRegionsAssigned(cluster, regions);
     expectedNumRS--;
@@ -159,7 +174,7 @@ public class TestRollingRestart {
     expectedNumRS++;
     log("Region server " + num + " is back online");
     log("Waiting for no more RIT");
-    ZKAssign.blockUntilNoRIT(zkw);
+    blockUntilNoRIT(zkw, master);
     log("Verifying there are " + numRegions + " assigned on cluster");
     assertRegionsAssigned(cluster, regions);
     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
@@ -192,7 +207,7 @@ public class TestRollingRestart {
     waitForRSShutdownToStartAndFinish(activeMaster,
         metaServer.getRegionServer().getServerName());
     log("Waiting for no more RIT");
-    ZKAssign.blockUntilNoRIT(zkw);
+    blockUntilNoRIT(zkw, master);
     log("Verifying there are " + numRegions + " assigned on cluster");
     assertRegionsAssigned(cluster, regions);
     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
@@ -208,7 +223,7 @@ public class TestRollingRestart {
     waitForRSShutdownToStartAndFinish(activeMaster,
         metaServer.getRegionServer().getServerName());
     log("RS shutdown done, waiting for no more RIT");
-    ZKAssign.blockUntilNoRIT(zkw);
+    blockUntilNoRIT(zkw, master);
     log("Verifying there are " + numRegions + " assigned on cluster");
     assertRegionsAssigned(cluster, regions);
     assertEquals(expectedNumRS, cluster.getRegionServerThreads().size());
@@ -219,7 +234,7 @@ public class TestRollingRestart {
     cluster.startRegionServer().waitForServerOnline();
     Thread.sleep(1000);
     log("Waiting for no more RIT");
-    ZKAssign.blockUntilNoRIT(zkw);
+    blockUntilNoRIT(zkw, master);
     log("Verifying there are " + numRegions + " assigned on cluster");
     assertRegionsAssigned(cluster, regions);
     // Shutdown server hosting META
@@ -232,7 +247,7 @@ public class TestRollingRestart {
     waitForRSShutdownToStartAndFinish(activeMaster,
         metaServer.getRegionServer().getServerName());
     log("RS shutdown done, waiting for no more RIT");
-    ZKAssign.blockUntilNoRIT(zkw);
+    blockUntilNoRIT(zkw, master);
     log("Verifying there are " + numRegions + " assigned on cluster");
     assertRegionsAssigned(cluster, regions);

@@ -246,7 +261,7 @@ public class TestRollingRestart {
     waitForRSShutdownToStartAndFinish(activeMaster,
         metaServer.getRegionServer().getServerName());
     log("RS shutdown done, waiting for no more RIT");
-    ZKAssign.blockUntilNoRIT(zkw);
+    blockUntilNoRIT(zkw, master);
     log("Verifying there are " + numRegions + " assigned on cluster");
     assertRegionsAssigned(cluster, regions);

@@ -260,7 +275,7 @@ public class TestRollingRestart {
     waitForRSShutdownToStartAndFinish(activeMaster,
         metaServer.getRegionServer().getServerName());
     log("RS shutdown done, waiting for no more RIT");
-    ZKAssign.blockUntilNoRIT(zkw);
+    blockUntilNoRIT(zkw, master);
     log("Verifying there are " + numRegions + " assigned on cluster");
     assertRegionsAssigned(cluster, regions);

@@ -280,6 +295,12 @@ public class TestRollingRestart {
     TEST_UTIL.shutdownMiniCluster();
   }

+  private void blockUntilNoRIT(ZooKeeperWatcher zkw, HMaster master)
+      throws KeeperException, InterruptedException {
+    ZKAssign.blockUntilNoRIT(zkw);
+    master.assignmentManager.waitUntilNoRegionsInTransition(60000);
+  }
+
   private void waitForRSShutdownToStartAndFinish(MasterThread activeMaster,
       String serverName) throws InterruptedException {
     ServerManager sm = activeMaster.getMaster().getServerManager();
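Note: the new helper waits on both views of assignment state: ZKAssign.blockUntilNoRIT(zkw) watches the unassigned znodes, while waitUntilNoRegionsInTransition(60000) bounds a wait on the master's in-memory regionsInTransition map. A sketch of what such a bounded wait can look like follows; only the method name comes from the patch, the monitor-style body is an assumption.

import java.util.Map;
import java.util.TreeMap;

// Sketch of a bounded wait on an in-memory regions-in-transition map.
// Illustrative only; not the trunk AssignmentManager code.
public class RITWaitSketch {
  private final Map<String, Object> regionsInTransition =
      new TreeMap<String, Object>();

  // Returns true once the map drains, false if the timeout elapses first.
  public synchronized boolean waitUntilNoRegionsInTransition(long timeout)
      throws InterruptedException {
    long deadline = System.currentTimeMillis() + timeout;
    while (!regionsInTransition.isEmpty()) {
      long remaining = deadline - System.currentTimeMillis();
      if (remaining <= 0) {
        return false; // timed out with regions still in transition
      }
      wait(remaining); // assumes notifyAll() whenever a region clears
    }
    return true;
  }
}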
@@ -298,7 +319,7 @@ public class TestRollingRestart {
   }

   private void log(String msg) {
-    LOG.debug("\n\n" + msg + "\n");
+    LOG.debug("\n\nTRR: " + msg + "\n");
   }

   private RegionServerThread getServerHostingMeta(MiniHBaseCluster cluster) {
@@ -325,16 +346,25 @@ public class TestRollingRestart {
     for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
       numFound += rst.getRegionServer().getNumberOfOnlineRegions();
     }
-    if (expectedRegions.size() != numFound) {
-      LOG.debug("Expected to find " + expectedRegions.size() + " but only found"
+    if (expectedRegions.size() > numFound) {
+      log("Expected to find " + expectedRegions.size() + " but only found"
           + " " + numFound);
       NavigableSet<String> foundRegions = getAllOnlineRegions(cluster);
       for (String region : expectedRegions) {
         if (!foundRegions.contains(region)) {
-          LOG.debug("Missing region: " + region);
+          log("Missing region: " + region);
         }
       }
       assertEquals(expectedRegions.size(), numFound);
+    } else if (expectedRegions.size() < numFound) {
+      int doubled = numFound - expectedRegions.size();
+      log("Expected to find " + expectedRegions.size() + " but found"
+          + " " + numFound + " (" + doubled + " double assignments?)");
+      NavigableSet<String> doubleRegions = getDoubleAssignedRegions(cluster);
+      for (String region : doubleRegions) {
+        log("Region is double assigned: " + region);
+      }
+      assertEquals(expectedRegions.size(), numFound);
     } else {
       log("Success! Found expected number of " + numFound + " regions");
     }
@@ -350,4 +380,18 @@ public class TestRollingRestart {
     return online;
   }

+  private NavigableSet<String> getDoubleAssignedRegions(
+      MiniHBaseCluster cluster) {
+    NavigableSet<String> online = new TreeSet<String>();
+    NavigableSet<String> doubled = new TreeSet<String>();
+    for (RegionServerThread rst : cluster.getLiveRegionServerThreads()) {
+      for (HRegionInfo region : rst.getRegionServer().getOnlineRegions()) {
+        if(!online.add(region.getRegionNameAsString())) {
+          doubled.add(region.getRegionNameAsString());
+        }
+      }
+    }
+    return doubled;
+  }
+
 }
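Note: getDoubleAssignedRegions relies on TreeSet.add() returning false when the element is already present, so a region reported online by two servers lands in the doubled set. A standalone illustration of that idiom, with made-up region names:

import java.util.NavigableSet;
import java.util.TreeSet;

public class DoubleAssignExample {
  public static void main(String[] args) {
    // Region names as several servers might report them at once; the
    // sample values are made up.
    String[] reported = {"region-a", "region-b", "region-a", "region-c"};
    NavigableSet<String> seen = new TreeSet<String>();
    NavigableSet<String> doubled = new TreeSet<String>();
    for (String name : reported) {
      // add() returns false when the name was already seen on another
      // server, i.e. the region is assigned twice.
      if (!seen.add(name)) {
        doubled.add(name);
      }
    }
    System.out.println(doubled); // prints [region-a]
  }
}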