HBASE-27579 CatalogJanitor can cause data loss due to errors during cleanMergeRegion (#4986)
Signed-off-by: Andrew Purtell <apurtell@apache.org> Signed-off-by: Viraj Jasani <vjasani@apache.org> Signed-off-by: Duo Zhang <zhangduo@apache.org>
This commit is contained in:
parent
8fc3ef3eb2
commit
c6a17220e0
@ -34,7 +34,6 @@ import org.apache.hadoop.hbase.HConstants;
|
|||||||
import org.apache.hadoop.hbase.MetaTableAccessor;
|
import org.apache.hadoop.hbase.MetaTableAccessor;
|
||||||
import org.apache.hadoop.hbase.ScheduledChore;
|
import org.apache.hadoop.hbase.ScheduledChore;
|
||||||
import org.apache.hadoop.hbase.TableName;
|
import org.apache.hadoop.hbase.TableName;
|
||||||
import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor;
|
|
||||||
import org.apache.hadoop.hbase.client.Connection;
|
import org.apache.hadoop.hbase.client.Connection;
|
||||||
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
import org.apache.hadoop.hbase.client.ConnectionFactory;
|
||||||
import org.apache.hadoop.hbase.client.Get;
|
import org.apache.hadoop.hbase.client.Get;
|
||||||
@ -183,12 +182,12 @@ public class CatalogJanitor extends ScheduledChore {
|
|||||||
for (Map.Entry<RegionInfo, Result> e : mergedRegions.entrySet()) {
|
for (Map.Entry<RegionInfo, Result> e : mergedRegions.entrySet()) {
|
||||||
if (this.services.isInMaintenanceMode()) {
|
if (this.services.isInMaintenanceMode()) {
|
||||||
// Stop cleaning if the master is in maintenance mode
|
// Stop cleaning if the master is in maintenance mode
|
||||||
LOG.debug("In maintenence mode, not cleaning");
|
LOG.debug("In maintenance mode, not cleaning");
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
List<RegionInfo> parents = MetaTableAccessor.getMergeRegions(e.getValue().rawCells());
|
List<RegionInfo> parents = MetaTableAccessor.getMergeRegions(e.getValue().rawCells());
|
||||||
if (parents != null && cleanMergeRegion(e.getKey(), parents)) {
|
if (parents != null && cleanMergeRegion(this.services, e.getKey(), parents)) {
|
||||||
gcs++;
|
gcs++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -201,7 +200,7 @@ public class CatalogJanitor extends ScheduledChore {
|
|||||||
if (this.services.isInMaintenanceMode()) {
|
if (this.services.isInMaintenanceMode()) {
|
||||||
// Stop cleaning if the master is in maintenance mode
|
// Stop cleaning if the master is in maintenance mode
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug("In maintenence mode, not cleaning");
|
LOG.debug("In maintenance mode, not cleaning");
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@ -248,30 +247,24 @@ public class CatalogJanitor extends ScheduledChore {
|
|||||||
* @return true if we delete references in merged region on hbase:meta and archive the files on
|
* @return true if we delete references in merged region on hbase:meta and archive the files on
|
||||||
* the file system
|
* the file system
|
||||||
*/
|
*/
|
||||||
private boolean cleanMergeRegion(final RegionInfo mergedRegion, List<RegionInfo> parents)
|
static boolean cleanMergeRegion(MasterServices services, final RegionInfo mergedRegion,
|
||||||
throws IOException {
|
List<RegionInfo> parents) throws IOException {
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug("Cleaning merged region {}", mergedRegion);
|
LOG.debug("Cleaning merged region {}", mergedRegion);
|
||||||
}
|
}
|
||||||
FileSystem fs = this.services.getMasterFileSystem().getFileSystem();
|
|
||||||
Path rootdir = this.services.getMasterFileSystem().getRootDir();
|
Pair<Boolean, Boolean> result =
|
||||||
Path tabledir = CommonFSUtils.getTableDir(rootdir, mergedRegion.getTable());
|
checkRegionReferences(services, mergedRegion.getTable(), mergedRegion);
|
||||||
TableDescriptor htd = getDescriptor(mergedRegion.getTable());
|
|
||||||
HRegionFileSystem regionFs = null;
|
if (hasNoReferences(result)) {
|
||||||
try {
|
|
||||||
regionFs = HRegionFileSystem.openRegionFromFileSystem(this.services.getConfiguration(), fs,
|
|
||||||
tabledir, mergedRegion, true);
|
|
||||||
} catch (IOException e) {
|
|
||||||
LOG.warn("Merged region does not exist: " + mergedRegion.getEncodedName());
|
|
||||||
}
|
|
||||||
if (regionFs == null || !regionFs.hasReferences(htd)) {
|
|
||||||
if (LOG.isDebugEnabled()) {
|
if (LOG.isDebugEnabled()) {
|
||||||
LOG.debug(
|
LOG.debug(
|
||||||
"Deleting parents ({}) from fs; merged child {} no longer holds references", parents
|
"Deleting parents ({}) from fs; merged child {} no longer holds references", parents
|
||||||
.stream().map(r -> RegionInfo.getShortNameToLog(r)).collect(Collectors.joining(", ")),
|
.stream().map(r -> RegionInfo.getShortNameToLog(r)).collect(Collectors.joining(", ")),
|
||||||
mergedRegion);
|
mergedRegion);
|
||||||
}
|
}
|
||||||
ProcedureExecutor<MasterProcedureEnv> pe = this.services.getMasterProcedureExecutor();
|
|
||||||
|
ProcedureExecutor<MasterProcedureEnv> pe = services.getMasterProcedureExecutor();
|
||||||
GCMultipleMergedRegionsProcedure mergeRegionProcedure =
|
GCMultipleMergedRegionsProcedure mergeRegionProcedure =
|
||||||
new GCMultipleMergedRegionsProcedure(pe.getEnvironment(), mergedRegion, parents);
|
new GCMultipleMergedRegionsProcedure(pe.getEnvironment(), mergedRegion, parents);
|
||||||
pe.submitProcedure(mergeRegionProcedure);
|
pe.submitProcedure(mergeRegionProcedure);
|
||||||
@ -280,7 +273,15 @@ public class CatalogJanitor extends ScheduledChore {
|
|||||||
mergedRegion);
|
mergedRegion);
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
|
} else {
|
||||||
|
if (LOG.isDebugEnabled()) {
|
||||||
|
LOG.debug(
|
||||||
|
"Deferring cleanup up of {} parents of merged region {}, because references "
|
||||||
|
+ "still exist in merged region or we encountered an exception in checking",
|
||||||
|
parents.size(), mergedRegion.getEncodedName());
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -332,8 +333,10 @@ public class CatalogJanitor extends ScheduledChore {
|
|||||||
}
|
}
|
||||||
// Run checks on each daughter split.
|
// Run checks on each daughter split.
|
||||||
PairOfSameType<RegionInfo> daughters = MetaTableAccessor.getDaughterRegions(rowContent);
|
PairOfSameType<RegionInfo> daughters = MetaTableAccessor.getDaughterRegions(rowContent);
|
||||||
Pair<Boolean, Boolean> a = checkDaughterInFs(services, parent, daughters.getFirst());
|
Pair<Boolean, Boolean> a =
|
||||||
Pair<Boolean, Boolean> b = checkDaughterInFs(services, parent, daughters.getSecond());
|
checkRegionReferences(services, parent.getTable(), daughters.getFirst());
|
||||||
|
Pair<Boolean, Boolean> b =
|
||||||
|
checkRegionReferences(services, parent.getTable(), daughters.getSecond());
|
||||||
if (hasNoReferences(a) && hasNoReferences(b)) {
|
if (hasNoReferences(a) && hasNoReferences(b)) {
|
||||||
String daughterA =
|
String daughterA =
|
||||||
daughters.getFirst() != null ? daughters.getFirst().getShortNameToLog() : "null";
|
daughters.getFirst() != null ? daughters.getFirst().getShortNameToLog() : "null";
|
||||||
@ -386,59 +389,45 @@ public class CatalogJanitor extends ScheduledChore {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Checks if a daughter region -- either splitA or splitB -- still holds references to parent.
|
* Checks if a region still holds references to parent.
|
||||||
* @param parent Parent region
|
* @param tableName The table for the region
|
||||||
* @param daughter Daughter region
|
* @param region The region to check
|
||||||
* @return A pair where the first boolean says whether or not the daughter region directory exists
|
* @return A pair where the first boolean says whether the region directory exists in the
|
||||||
* in the filesystem and then the second boolean says whether the daughter has references
|
* filesystem and then the second boolean says whether the region has references to a
|
||||||
* to the parent.
|
* parent.
|
||||||
*/
|
*/
|
||||||
private static Pair<Boolean, Boolean> checkDaughterInFs(MasterServices services,
|
private static Pair<Boolean, Boolean> checkRegionReferences(MasterServices services,
|
||||||
final RegionInfo parent, final RegionInfo daughter) throws IOException {
|
TableName tableName, RegionInfo region) throws IOException {
|
||||||
if (daughter == null) {
|
if (region == null) {
|
||||||
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
|
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
|
||||||
}
|
}
|
||||||
|
|
||||||
FileSystem fs = services.getMasterFileSystem().getFileSystem();
|
FileSystem fs = services.getMasterFileSystem().getFileSystem();
|
||||||
Path rootdir = services.getMasterFileSystem().getRootDir();
|
Path rootdir = services.getMasterFileSystem().getRootDir();
|
||||||
Path tabledir = CommonFSUtils.getTableDir(rootdir, daughter.getTable());
|
Path tabledir = CommonFSUtils.getTableDir(rootdir, tableName);
|
||||||
|
Path regionDir = new Path(tabledir, region.getEncodedName());
|
||||||
Path daughterRegionDir = new Path(tabledir, daughter.getEncodedName());
|
|
||||||
|
|
||||||
HRegionFileSystem regionFs;
|
|
||||||
|
|
||||||
try {
|
try {
|
||||||
if (!CommonFSUtils.isExists(fs, daughterRegionDir)) {
|
if (!CommonFSUtils.isExists(fs, regionDir)) {
|
||||||
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
|
return new Pair<>(Boolean.FALSE, Boolean.FALSE);
|
||||||
}
|
}
|
||||||
} catch (IOException ioe) {
|
} catch (IOException ioe) {
|
||||||
LOG.error("Error trying to determine if daughter region exists, "
|
LOG.error("Error trying to determine if region exists, assuming exists and has references",
|
||||||
+ "assuming exists and has references", ioe);
|
ioe);
|
||||||
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
|
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
|
||||||
}
|
}
|
||||||
|
|
||||||
boolean references = false;
|
TableDescriptor tableDescriptor = services.getTableDescriptors().get(tableName);
|
||||||
TableDescriptor parentDescriptor = services.getTableDescriptors().get(parent.getTable());
|
|
||||||
try {
|
try {
|
||||||
regionFs = HRegionFileSystem.openRegionFromFileSystem(services.getConfiguration(), fs,
|
HRegionFileSystem regionFs = HRegionFileSystem
|
||||||
tabledir, daughter, true);
|
.openRegionFromFileSystem(services.getConfiguration(), fs, tabledir, region, true);
|
||||||
|
boolean references = regionFs.hasReferences(tableDescriptor);
|
||||||
for (ColumnFamilyDescriptor family : parentDescriptor.getColumnFamilies()) {
|
return new Pair<>(Boolean.TRUE, references);
|
||||||
references = regionFs.hasReferences(family.getNameAsString());
|
|
||||||
if (references) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} catch (IOException e) {
|
} catch (IOException e) {
|
||||||
LOG.error("Error trying to determine referenced files from : " + daughter.getEncodedName()
|
LOG.error("Error trying to determine if region {} has references, assuming it does",
|
||||||
+ ", to: " + parent.getEncodedName() + " assuming has references", e);
|
region.getEncodedName(), e);
|
||||||
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
|
return new Pair<>(Boolean.TRUE, Boolean.TRUE);
|
||||||
}
|
}
|
||||||
return new Pair<>(Boolean.TRUE, references);
|
|
||||||
}
|
|
||||||
|
|
||||||
private TableDescriptor getDescriptor(final TableName tableName) throws IOException {
|
|
||||||
return this.services.getTableDescriptors().get(tableName);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private void updateAssignmentManagerMetrics() {
|
private void updateAssignmentManagerMetrics() {
|
||||||
|
@ -21,8 +21,12 @@ import static org.apache.hadoop.hbase.util.HFileArchiveTestingUtil.assertArchive
|
|||||||
import static org.junit.Assert.assertEquals;
|
import static org.junit.Assert.assertEquals;
|
||||||
import static org.junit.Assert.assertFalse;
|
import static org.junit.Assert.assertFalse;
|
||||||
import static org.junit.Assert.assertTrue;
|
import static org.junit.Assert.assertTrue;
|
||||||
|
import static org.mockito.Mockito.any;
|
||||||
|
import static org.mockito.Mockito.doAnswer;
|
||||||
import static org.mockito.Mockito.doReturn;
|
import static org.mockito.Mockito.doReturn;
|
||||||
|
import static org.mockito.Mockito.doThrow;
|
||||||
import static org.mockito.Mockito.spy;
|
import static org.mockito.Mockito.spy;
|
||||||
|
import static org.mockito.Mockito.when;
|
||||||
|
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
@ -34,6 +38,7 @@ import java.util.SortedMap;
|
|||||||
import java.util.SortedSet;
|
import java.util.SortedSet;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
import java.util.concurrent.ConcurrentSkipListMap;
|
import java.util.concurrent.ConcurrentSkipListMap;
|
||||||
|
import java.util.concurrent.atomic.AtomicBoolean;
|
||||||
import org.apache.hadoop.fs.FSDataOutputStream;
|
import org.apache.hadoop.fs.FSDataOutputStream;
|
||||||
import org.apache.hadoop.fs.FileStatus;
|
import org.apache.hadoop.fs.FileStatus;
|
||||||
import org.apache.hadoop.fs.FileSystem;
|
import org.apache.hadoop.fs.FileSystem;
|
||||||
@ -46,10 +51,12 @@ import org.apache.hadoop.hbase.HRegionInfo;
|
|||||||
import org.apache.hadoop.hbase.MetaMockingUtil;
|
import org.apache.hadoop.hbase.MetaMockingUtil;
|
||||||
import org.apache.hadoop.hbase.ServerName;
|
import org.apache.hadoop.hbase.ServerName;
|
||||||
import org.apache.hadoop.hbase.TableName;
|
import org.apache.hadoop.hbase.TableName;
|
||||||
|
import org.apache.hadoop.hbase.client.RegionInfo;
|
||||||
import org.apache.hadoop.hbase.client.Result;
|
import org.apache.hadoop.hbase.client.Result;
|
||||||
import org.apache.hadoop.hbase.client.TableDescriptor;
|
import org.apache.hadoop.hbase.client.TableDescriptor;
|
||||||
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
|
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
|
||||||
import org.apache.hadoop.hbase.io.Reference;
|
import org.apache.hadoop.hbase.io.Reference;
|
||||||
|
import org.apache.hadoop.hbase.master.MasterFileSystem;
|
||||||
import org.apache.hadoop.hbase.master.MasterServices;
|
import org.apache.hadoop.hbase.master.MasterServices;
|
||||||
import org.apache.hadoop.hbase.master.assignment.MockMasterServices;
|
import org.apache.hadoop.hbase.master.assignment.MockMasterServices;
|
||||||
import org.apache.hadoop.hbase.master.janitor.CatalogJanitor.SplitParentFirstComparator;
|
import org.apache.hadoop.hbase.master.janitor.CatalogJanitor.SplitParentFirstComparator;
|
||||||
@ -114,6 +121,100 @@ public class TestCatalogJanitor {
|
|||||||
this.masterServices.stop("DONE");
|
this.masterServices.stop("DONE");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testCleanMerge() throws IOException {
|
||||||
|
TableDescriptor td = createTableDescriptorForCurrentMethod();
|
||||||
|
// Create regions.
|
||||||
|
HRegionInfo merged =
|
||||||
|
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("eee"));
|
||||||
|
HRegionInfo parenta =
|
||||||
|
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("ccc"));
|
||||||
|
HRegionInfo parentb =
|
||||||
|
new HRegionInfo(td.getTableName(), Bytes.toBytes("ccc"), Bytes.toBytes("eee"));
|
||||||
|
|
||||||
|
List<RegionInfo> parents = new ArrayList<>();
|
||||||
|
parents.add(parenta);
|
||||||
|
parents.add(parentb);
|
||||||
|
|
||||||
|
Path rootdir = this.masterServices.getMasterFileSystem().getRootDir();
|
||||||
|
Path tabledir = CommonFSUtils.getTableDir(rootdir, td.getTableName());
|
||||||
|
Path storedir =
|
||||||
|
HRegionFileSystem.getStoreHomedir(tabledir, merged, td.getColumnFamilies()[0].getName());
|
||||||
|
|
||||||
|
Path parentaRef = createMergeReferenceFile(storedir, merged, parenta);
|
||||||
|
Path parentbRef = createMergeReferenceFile(storedir, merged, parentb);
|
||||||
|
|
||||||
|
// references exist, should not clean
|
||||||
|
assertFalse(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
|
||||||
|
|
||||||
|
masterServices.getMasterFileSystem().getFileSystem().delete(parentaRef, false);
|
||||||
|
|
||||||
|
// one reference still exists, should not clean
|
||||||
|
assertFalse(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
|
||||||
|
|
||||||
|
masterServices.getMasterFileSystem().getFileSystem().delete(parentbRef, false);
|
||||||
|
|
||||||
|
// all references removed, should clean
|
||||||
|
assertTrue(CatalogJanitor.cleanMergeRegion(masterServices, merged, parents));
|
||||||
|
}
|
||||||
|
|
||||||
|
@Test
|
||||||
|
public void testDontCleanMergeIfFileSystemException() throws IOException {
|
||||||
|
TableDescriptor td = createTableDescriptorForCurrentMethod();
|
||||||
|
// Create regions.
|
||||||
|
HRegionInfo merged =
|
||||||
|
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("eee"));
|
||||||
|
HRegionInfo parenta =
|
||||||
|
new HRegionInfo(td.getTableName(), Bytes.toBytes("aaa"), Bytes.toBytes("ccc"));
|
||||||
|
HRegionInfo parentb =
|
||||||
|
new HRegionInfo(td.getTableName(), Bytes.toBytes("ccc"), Bytes.toBytes("eee"));
|
||||||
|
|
||||||
|
List<RegionInfo> parents = new ArrayList<>();
|
||||||
|
parents.add(parenta);
|
||||||
|
parents.add(parentb);
|
||||||
|
|
||||||
|
Path rootdir = this.masterServices.getMasterFileSystem().getRootDir();
|
||||||
|
Path tabledir = CommonFSUtils.getTableDir(rootdir, td.getTableName());
|
||||||
|
Path storedir =
|
||||||
|
HRegionFileSystem.getStoreHomedir(tabledir, merged, td.getColumnFamilies()[0].getName());
|
||||||
|
createMergeReferenceFile(storedir, merged, parenta);
|
||||||
|
|
||||||
|
MasterServices mockedMasterServices = spy(masterServices);
|
||||||
|
MasterFileSystem mockedMasterFileSystem = spy(masterServices.getMasterFileSystem());
|
||||||
|
FileSystem mockedFileSystem = spy(masterServices.getMasterFileSystem().getFileSystem());
|
||||||
|
|
||||||
|
when(mockedMasterServices.getMasterFileSystem()).thenReturn(mockedMasterFileSystem);
|
||||||
|
when(mockedMasterFileSystem.getFileSystem()).thenReturn(mockedFileSystem);
|
||||||
|
|
||||||
|
// throw on the first exists check
|
||||||
|
doThrow(new IOException("Some exception")).when(mockedFileSystem).exists(any());
|
||||||
|
|
||||||
|
assertFalse(CatalogJanitor.cleanMergeRegion(mockedMasterServices, merged, parents));
|
||||||
|
|
||||||
|
// throw on the second exists check (within HRegionfileSystem)
|
||||||
|
AtomicBoolean returned = new AtomicBoolean(false);
|
||||||
|
doAnswer(invocationOnMock -> {
|
||||||
|
if (!returned.get()) {
|
||||||
|
returned.set(true);
|
||||||
|
return masterServices.getMasterFileSystem().getFileSystem()
|
||||||
|
.exists(invocationOnMock.getArgument(0));
|
||||||
|
}
|
||||||
|
throw new IOException("Some exception");
|
||||||
|
}).when(mockedFileSystem).exists(any());
|
||||||
|
|
||||||
|
assertFalse(CatalogJanitor.cleanMergeRegion(mockedMasterServices, merged, parents));
|
||||||
|
}
|
||||||
|
|
||||||
|
private Path createMergeReferenceFile(Path storeDir, HRegionInfo mergedRegion,
|
||||||
|
HRegionInfo parentRegion) throws IOException {
|
||||||
|
Reference ref = Reference.createTopReference(mergedRegion.getStartKey());
|
||||||
|
long now = EnvironmentEdgeManager.currentTime();
|
||||||
|
// Reference name has this format: StoreFile#REF_NAME_PARSER
|
||||||
|
Path p = new Path(storeDir, Long.toString(now) + "." + parentRegion.getEncodedName());
|
||||||
|
FileSystem fs = this.masterServices.getMasterFileSystem().getFileSystem();
|
||||||
|
return ref.write(fs, p);
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Test clearing a split parent.
|
* Test clearing a split parent.
|
||||||
*/
|
*/
|
||||||
|
Loading…
x
Reference in New Issue
Block a user