HBASE-9724 Failed region split is not handled correctly by AM

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1530826 13f79535-47bb-0310-9956-ffa450edef68
Enis Soztutar 2013-10-09 23:02:14 +00:00
parent 066638e6cc
commit 341256ae5b
2 changed files with 73 additions and 2 deletions

hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java

@@ -146,7 +146,7 @@ public class AssignmentManager extends ZooKeeperListener {
    * See below in {@link #assign()} and {@link #unassign()}.
    */
   private final int maximumAttempts;
-
+
   /**
    * The sleep time for which the assignment will wait before retrying in case of hbase:meta assignment
    * failure due to lack of availability of region plan
@@ -1324,7 +1324,15 @@ public class AssignmentManager extends ZooKeeperListener {
               + "but this table is disabled, triggering close of region");
             unassign(regionInfo);
           }
+        } else if (rs.isSplitting()) {
+          LOG.debug("Ephemeral node deleted. Found in SPLITTING state. " + "Removing from RIT "
+              + rs.getRegion());
+          // it can be either SPLIT fail, or RS dead.
+          regionStates.regionOnline(rs.getRegion(), rs.getServerName());
         }
+        // RS does not delete the znode in case SPLIT, it only means RS died which
+        // will be handled by SSH
+        // in region merge we do not put merging regions to MERGING state
       } finally {
         lock.unlock();
       }
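The hunk above is the heart of the fix: when a region's ephemeral transition znode is deleted while the region is still marked SPLITTING, the AssignmentManager now moves the region back online instead of leaving it stuck in transition, because the deletion can only mean the split was rolled back or the regionserver died (the latter is recovered separately by the ServerShutdownHandler, SSH). A minimal sketch of that recovery rule, using simplified hypothetical types rather than HBase's real classes:

import java.util.HashMap;
import java.util.Map;

// Hypothetical, simplified model of the state rule the fix enforces;
// this is not the AssignmentManager API.
class SplitRollbackSketch {
  enum State { OPEN, SPLITTING }

  private final Map<String, State> regionStates = new HashMap<>();

  void startSplit(String regionName) {
    regionStates.put(regionName, State.SPLITTING);
  }

  // Invoked when the region's ephemeral transition znode disappears.
  void onTransitionNodeDeleted(String regionName) {
    if (regionStates.get(regionName) == State.SPLITTING) {
      // Either the split failed and was rolled back, or the hosting
      // regionserver died; in both cases the region must not be left
      // "in transition". A dead server is handled separately (SSH).
      regionStates.put(regionName, State.OPEN);
    }
  }
}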

hbase-server/src/test/java/org/apache/hadoop/hbase/regionserver/TestSplitTransactionOnCluster.java

@@ -37,7 +37,7 @@ import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hbase.Abortable;
-import org.apache.hadoop.hbase.TableName;
+import org.apache.hadoop.hbase.Coprocessor;
 import org.apache.hadoop.hbase.HBaseIOException;
 import org.apache.hadoop.hbase.HBaseTestingUtility;
 import org.apache.hadoop.hbase.HColumnDescriptor;
@@ -50,7 +50,9 @@ import org.apache.hadoop.hbase.MiniHBaseCluster;
 import org.apache.hadoop.hbase.RegionTransition;
 import org.apache.hadoop.hbase.Server;
 import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.TableName;
 import org.apache.hadoop.hbase.UnknownRegionException;
+import org.apache.hadoop.hbase.Waiter;
 import org.apache.hadoop.hbase.ZooKeeperConnectionException;
 import org.apache.hadoop.hbase.catalog.MetaEditor;
 import org.apache.hadoop.hbase.catalog.MetaReader;
@@ -192,6 +194,7 @@ public class TestSplitTransactionOnCluster {
     assertTrue("not able to find a splittable region", region != null);
     new Thread() {
+      @Override
       public void run() {
         SplitTransaction st = null;
         st = new MockedSplitTransaction(region, Bytes.toBytes("row2"));
@@ -250,6 +253,65 @@
     }
   }

+  @Test(timeout = 60000)
+  public void testRITStateForRollback() throws Exception {
+    final TableName tableName =
+        TableName.valueOf("testRITStateForRollback");
+    try {
+      // Create table then get the single region for our new table.
+      HTable t = createTableAndWait(tableName.getName(), Bytes.toBytes("cf"));
+      final List<HRegion> regions = cluster.getRegions(tableName);
+      final HRegionInfo hri = getAndCheckSingleTableRegion(regions);
+      int regionServerIndex = cluster.getServerWith(regions.get(0).getRegionName());
+      final HRegionServer regionServer = cluster.getRegionServer(regionServerIndex);
+      insertData(tableName.getName(), admin, t);
+      t.close();
+      // Turn off balancer so it doesn't cut in and mess up our placements.
+      this.admin.setBalancerRunning(false, true);
+      // Turn off the meta scanner so it doesn't remove the parent on us.
+      cluster.getMaster().setCatalogJanitorEnabled(false);
+      // Find a splittable region.
+      final HRegion region = findSplittableRegion(regions);
+      assertTrue("not able to find a splittable region", region != null);
+      // Install a region coprocessor to fail splits.
+      region.getCoprocessorHost().load(FailingSplitRegionObserver.class,
+        Coprocessor.PRIORITY_USER, region.getBaseConf());
+      // Split async.
+      this.admin.split(region.getRegionName(), new byte[] {42});
+      // We have to wait until the SPLITTING state is seen by the master.
+      FailingSplitRegionObserver.latch.await();
+      LOG.info("Waiting for region to come out of RIT");
+      TESTING_UTIL.waitFor(60000, 1000, new Waiter.Predicate<Exception>() {
+        @Override
+        public boolean evaluate() throws Exception {
+          RegionStates regionStates = cluster.getMaster().getAssignmentManager().getRegionStates();
+          Map<String, RegionState> rit = regionStates.getRegionsInTransition();
+          return !rit.containsKey(hri.getEncodedName());
+        }
+      });
+    } finally {
+      admin.setBalancerRunning(true, false);
+      cluster.getMaster().setCatalogJanitorEnabled(true);
+      TESTING_UTIL.deleteTable(tableName);
+    }
+  }
+
+  public static class FailingSplitRegionObserver extends BaseRegionObserver {
+    static volatile CountDownLatch latch = new CountDownLatch(1);
+
+    @Override
+    public void preSplitBeforePONR(ObserverContext<RegionCoprocessorEnvironment> ctx,
+        byte[] splitKey, List<Mutation> metaEntries) throws IOException {
+      latch.countDown();
+      throw new IOException("Causing rollback of region split");
+    }
+  }
+
   /**
    * A test that intentionally has master fail the processing of the split message.
    * Tests that the regionserver split ephemeral node gets cleaned up if it
@@ -1192,6 +1254,7 @@ public class TestSplitTransactionOnCluster {
       super(conf);
     }
+    @Override
     protected void startCatalogJanitorChore() {
       LOG.debug("Customised master executed.");
     }
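The new test is built around a coordination pattern worth noting: the failure-injecting hook counts down a CountDownLatch at the same moment it throws, so the test thread can block on the latch and only begin polling for recovery once the failure has actually been injected. A self-contained sketch of that pattern with hypothetical names (plain Java, no HBase APIs):

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicBoolean;

public class LatchedRollbackSketch {
  static final CountDownLatch failureInjected = new CountDownLatch(1);
  static final AtomicBoolean inTransition = new AtomicBoolean(false);

  public static void main(String[] args) throws InterruptedException {
    // "Regionserver" thread: starts a split, hits the injected failure,
    // rolls back, and clears the in-transition flag.
    new Thread(() -> {
      inTransition.set(true);        // region enters SPLITTING
      failureInjected.countDown();   // like FailingSplitRegionObserver's latch
      // the injected IOException would be thrown here; rollback follows:
      inTransition.set(false);       // region is back online
    }).start();

    // "Test" thread: wait until the failure is definitely in, then poll
    // (like Waiter.waitFor) until the transition has been cleared.
    failureInjected.await();
    long deadline = System.currentTimeMillis() + 60_000;
    while (inTransition.get()) {
      if (System.currentTimeMillis() > deadline) {
        throw new AssertionError("region stuck in transition");
      }
      Thread.sleep(100);
    }
    System.out.println("rollback observed; region out of transition");
  }
}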