HBASE-12791 HBase does not attempt to clean up an aborted split when the regionserver is shutting down (Rajeshbabu)
This commit is contained in:
parent
f6a017ce63
commit
5b850caa80
|
@ -31,6 +31,7 @@ import java.util.TreeMap;
|
|||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Preconditions;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||
|
@ -46,6 +47,7 @@ import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
|||
import org.apache.hadoop.hbase.master.RegionState.State;
|
||||
import org.apache.hadoop.hbase.client.TableState;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.FSUtils;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
|
||||
/**
|
||||
|
@ -568,10 +570,6 @@ public class RegionStates {
|
|||
}
|
||||
}
|
||||
|
||||
for (HRegionInfo hri : regionsToOffline) {
|
||||
regionOffline(hri);
|
||||
}
|
||||
|
||||
for (RegionState state : regionsInTransition.values()) {
|
||||
HRegionInfo hri = state.getRegion();
|
||||
if (assignedRegions.contains(hri)) {
|
||||
|
@ -591,12 +589,27 @@ public class RegionStates {
|
|||
State.FAILED_OPEN, State.FAILED_CLOSE, State.OFFLINE)) {
|
||||
LOG.info("Found region in " + state + " to be reassigned by SSH for " + sn);
|
||||
rits.add(hri);
|
||||
} else if (isOneOfStates(state, State.SPLITTING_NEW)) {
|
||||
try {
|
||||
if (MetaTableAccessor.getRegion(server.getConnection(), state.getRegion()
|
||||
.getEncodedNameAsBytes()) == null) {
|
||||
regionsToOffline.add(state.getRegion());
|
||||
FSUtils.deleteRegionDir(server.getConfiguration(), state.getRegion());
|
||||
}
|
||||
} catch (IOException e) {
|
||||
LOG.warn("Got exception while deleting " + state.getRegion()
|
||||
+ " directories from file system.", e);
|
||||
}
|
||||
} else {
|
||||
LOG.warn("THIS SHOULD NOT HAPPEN: unexpected " + state);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (HRegionInfo hri : regionsToOffline) {
|
||||
regionOffline(hri);
|
||||
}
|
||||
|
||||
this.notifyAll();
|
||||
return rits;
|
||||
}
|
||||
|
|
|
@ -182,6 +182,21 @@ public abstract class FSUtils {
|
|||
return fs.exists(dir) && fs.delete(dir, true);
|
||||
}
|
||||
|
||||
/**
|
||||
* Delete the region directory if exists.
|
||||
* @param conf
|
||||
* @param hri
|
||||
* @return True if deleted the region directory.
|
||||
* @throws IOException
|
||||
*/
|
||||
public static boolean deleteRegionDir(final Configuration conf, final HRegionInfo hri)
|
||||
throws IOException {
|
||||
Path rootDir = getRootDir(conf);
|
||||
FileSystem fs = rootDir.getFileSystem(conf);
|
||||
return deleteDirectory(fs,
|
||||
new Path(getTableDir(rootDir, hri.getTable()), hri.getEncodedName()));
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of bytes that large input files should be optimally
|
||||
* be split into to minimize i/o time.
|
||||
|
|
|
@ -53,7 +53,6 @@ import java.util.concurrent.TimeoutException;
|
|||
import java.util.concurrent.atomic.AtomicBoolean;
|
||||
import java.util.concurrent.atomic.AtomicInteger;
|
||||
|
||||
|
||||
import org.apache.commons.lang.StringUtils;
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
|
@ -85,8 +84,6 @@ import org.apache.hadoop.hbase.ServerName;
|
|||
import org.apache.hadoop.hbase.TableDescriptor;
|
||||
import org.apache.hadoop.hbase.TableName;
|
||||
import org.apache.hadoop.hbase.ZooKeeperConnectionException;
|
||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||
import org.apache.hadoop.hbase.classification.InterfaceStability;
|
||||
import org.apache.hadoop.hbase.client.Admin;
|
||||
import org.apache.hadoop.hbase.client.ClusterConnection;
|
||||
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||
|
@ -134,7 +131,6 @@ import org.apache.hadoop.util.Tool;
|
|||
import org.apache.hadoop.util.ToolRunner;
|
||||
import org.apache.zookeeper.KeeperException;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import com.google.common.base.Joiner;
|
||||
import com.google.common.base.Preconditions;
|
||||
import com.google.common.collect.Lists;
|
||||
|
@ -1996,6 +1992,43 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
return;
|
||||
}
|
||||
|
||||
HRegionInfo hri = hbi.getHdfsHRI();
|
||||
TableInfo tableInfo = tablesInfo.get(hri.getTable());
|
||||
if (tableInfo.regionsFromMeta.isEmpty()) {
|
||||
for (HbckInfo h : regionInfoMap.values()) {
|
||||
if (h.getTableName().equals(hri.getTable())) {
|
||||
if (h.metaEntry != null) tableInfo.regionsFromMeta
|
||||
.add((HRegionInfo) h.metaEntry);
|
||||
}
|
||||
}
|
||||
Collections.sort(tableInfo.regionsFromMeta);
|
||||
}
|
||||
for (HRegionInfo region : tableInfo.regionsFromMeta) {
|
||||
if (Bytes.compareTo(region.getStartKey(), hri.getStartKey()) <= 0
|
||||
&& (region.getEndKey().length == 0 || Bytes.compareTo(region.getEndKey(),
|
||||
hri.getEndKey()) >= 0)
|
||||
&& Bytes.compareTo(region.getStartKey(), hri.getEndKey()) <= 0) {
|
||||
if(region.isSplit() || region.isOffline()) continue;
|
||||
Path regionDir = hbi.getHdfsRegionDir();
|
||||
FileSystem fs = regionDir.getFileSystem(getConf());
|
||||
List<Path> familyDirs = FSUtils.getFamilyDirs(fs, regionDir);
|
||||
for (Path familyDir : familyDirs) {
|
||||
List<Path> referenceFilePaths = FSUtils.getReferenceFilePaths(fs, familyDir);
|
||||
for (Path referenceFilePath : referenceFilePaths) {
|
||||
Path parentRegionDir =
|
||||
StoreFileInfo.getReferredToFile(referenceFilePath).getParent().getParent();
|
||||
if (parentRegionDir.toString().endsWith(region.getEncodedName())) {
|
||||
LOG.warn(hri + " start and stop keys are in the range of " + region
|
||||
+ ". The region might not be cleaned up from hdfs when region " + region
|
||||
+ " split failed. Hence deleting from hdfs.");
|
||||
HRegionFileSystem.deleteRegionFromFileSystem(getConf(), fs,
|
||||
regionDir.getParent(), hri);
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG.info("Patching hbase:meta with .regioninfo: " + hbi.getHdfsHRI());
|
||||
int numReplicas = admin.getTableDescriptor(hbi.getTableName()).getRegionReplication();
|
||||
HBaseFsckRepair.fixMetaHoleOnlineAndAddReplicas(getConf(), hbi.getHdfsHRI(),
|
||||
|
@ -2325,6 +2358,9 @@ public class HBaseFsck extends Configured implements Closeable {
|
|||
final Multimap<byte[], HbckInfo> overlapGroups =
|
||||
TreeMultimap.create(RegionSplitCalculator.BYTES_COMPARATOR, cmp);
|
||||
|
||||
// list of regions derived from meta entries.
|
||||
final List<HRegionInfo> regionsFromMeta = new ArrayList<HRegionInfo>();
|
||||
|
||||
TableInfo(TableName name) {
|
||||
this.tableName = name;
|
||||
deployedOn = new TreeSet <ServerName>();
|
||||
|
|
|
@ -950,6 +950,46 @@ public class TestSplitTransactionOnCluster {
|
|||
}
|
||||
}
|
||||
|
||||
@Test (timeout=300000)
|
||||
public void testSSHCleanupDaugtherRegionsOfAbortedSplit() throws Exception {
|
||||
TableName table = TableName.valueOf("testSSHCleanupDaugtherRegionsOfAbortedSplit");
|
||||
try {
|
||||
HTableDescriptor desc = new HTableDescriptor(table);
|
||||
desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
|
||||
admin.createTable(desc);
|
||||
HTable hTable = new HTable(cluster.getConfiguration(), desc.getTableName());
|
||||
for(int i = 1; i < 5; i++) {
|
||||
Put p1 = new Put(("r"+i).getBytes());
|
||||
p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
|
||||
hTable.put(p1);
|
||||
}
|
||||
admin.flush(desc.getTableName());
|
||||
List<HRegion> regions = cluster.getRegions(desc.getTableName());
|
||||
int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
|
||||
HRegionServer regionServer = cluster.getRegionServer(serverWith);
|
||||
cluster.getServerWith(regions.get(0).getRegionName());
|
||||
SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
|
||||
st.prepare();
|
||||
st.stepsBeforePONR(regionServer, regionServer, false);
|
||||
Path tableDir =
|
||||
FSUtils.getTableDir(cluster.getMaster().getMasterFileSystem().getRootDir(),
|
||||
desc.getTableName());
|
||||
tableDir.getFileSystem(cluster.getConfiguration());
|
||||
List<Path> regionDirs =
|
||||
FSUtils.getRegionDirs(tableDir.getFileSystem(cluster.getConfiguration()), tableDir);
|
||||
assertEquals(3,regionDirs.size());
|
||||
AssignmentManager am = cluster.getMaster().getAssignmentManager();
|
||||
am.processServerShutdown(regionServer.getServerName());
|
||||
assertEquals(am.getRegionStates().getRegionsInTransition().toString(), 0, am
|
||||
.getRegionStates().getRegionsInTransition().size());
|
||||
regionDirs =
|
||||
FSUtils.getRegionDirs(tableDir.getFileSystem(cluster.getConfiguration()), tableDir);
|
||||
assertEquals(1,regionDirs.size());
|
||||
} finally {
|
||||
TESTING_UTIL.deleteTable(table);
|
||||
}
|
||||
}
|
||||
|
||||
private void testSplitBeforeSettingSplittingInZKInternals() throws Exception {
|
||||
final TableName tableName = TableName.valueOf("testSplitBeforeSettingSplittingInZK");
|
||||
try {
|
||||
|
|
|
@ -88,6 +88,7 @@ import org.apache.hadoop.hbase.client.Table;
|
|||
import org.apache.hadoop.hbase.io.hfile.TestHFile;
|
||||
import org.apache.hadoop.hbase.master.AssignmentManager;
|
||||
import org.apache.hadoop.hbase.master.HMaster;
|
||||
import org.apache.hadoop.hbase.master.RegionState;
|
||||
import org.apache.hadoop.hbase.master.RegionStates;
|
||||
import org.apache.hadoop.hbase.master.TableLockManager;
|
||||
import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
|
||||
|
@ -96,6 +97,7 @@ import org.apache.hadoop.hbase.protobuf.generated.AdminProtos;
|
|||
import org.apache.hadoop.hbase.regionserver.HRegion;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionFileSystem;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegionServer;
|
||||
import org.apache.hadoop.hbase.regionserver.SplitTransaction;
|
||||
import org.apache.hadoop.hbase.regionserver.TestEndToEndSplitTransaction;
|
||||
import org.apache.hadoop.hbase.testclassification.LargeTests;
|
||||
import org.apache.hadoop.hbase.testclassification.MiscTests;
|
||||
|
@ -1173,6 +1175,61 @@ public class TestHBaseFsck {
|
|||
}
|
||||
}
|
||||
|
||||
@Test (timeout=180000)
|
||||
public void testCleanUpDaughtersNotInMetaAfterFailedSplit() throws Exception {
|
||||
TableName table = TableName.valueOf("testCleanUpDaughtersNotInMetaAfterFailedSplit");
|
||||
MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster();
|
||||
try {
|
||||
HTableDescriptor desc = new HTableDescriptor(table);
|
||||
desc.addFamily(new HColumnDescriptor(Bytes.toBytes("f")));
|
||||
admin.createTable(desc);
|
||||
tbl = new HTable(cluster.getConfiguration(), desc.getTableName());
|
||||
for (int i = 0; i < 5; i++) {
|
||||
Put p1 = new Put(("r" + i).getBytes());
|
||||
p1.add(Bytes.toBytes("f"), "q1".getBytes(), "v".getBytes());
|
||||
tbl.put(p1);
|
||||
}
|
||||
admin.flush(desc.getTableName());
|
||||
List<HRegion> regions = cluster.getRegions(desc.getTableName());
|
||||
int serverWith = cluster.getServerWith(regions.get(0).getRegionName());
|
||||
HRegionServer regionServer = cluster.getRegionServer(serverWith);
|
||||
cluster.getServerWith(regions.get(0).getRegionName());
|
||||
SplitTransaction st = new SplitTransaction(regions.get(0), Bytes.toBytes("r3"));
|
||||
st.prepare();
|
||||
st.stepsBeforePONR(regionServer, regionServer, false);
|
||||
AssignmentManager am = cluster.getMaster().getAssignmentManager();
|
||||
Map<String, RegionState> regionsInTransition = am.getRegionStates().getRegionsInTransition();
|
||||
for (RegionState state : regionsInTransition.values()) {
|
||||
am.regionOffline(state.getRegion());
|
||||
}
|
||||
Map<HRegionInfo, ServerName> regionsMap = new HashMap<HRegionInfo, ServerName>();
|
||||
regionsMap.put(regions.get(0).getRegionInfo(), regionServer.getServerName());
|
||||
am.assign(regionsMap);
|
||||
am.waitForAssignment(regions.get(0).getRegionInfo());
|
||||
HBaseFsck hbck = doFsck(conf, false);
|
||||
assertErrors(hbck, new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
|
||||
ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
|
||||
// holes are separate from overlap groups
|
||||
assertEquals(0, hbck.getOverlapGroups(table).size());
|
||||
|
||||
// fix hole
|
||||
assertErrors(
|
||||
doFsck(conf, false, true, false, false, false, false, false, false, false, false, null),
|
||||
new ERROR_CODE[] { ERROR_CODE.NOT_IN_META_OR_DEPLOYED,
|
||||
ERROR_CODE.NOT_IN_META_OR_DEPLOYED });
|
||||
|
||||
// check that hole fixed
|
||||
assertNoErrors(doFsck(conf, false));
|
||||
assertEquals(5, countRows());
|
||||
} finally {
|
||||
if (tbl != null) {
|
||||
tbl.close();
|
||||
tbl = null;
|
||||
}
|
||||
cleanupTable(table);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* This creates fixes a bad table with a hole in meta.
|
||||
*/
|
||||
|
|
Loading…
Reference in New Issue