HDDS-1365. Fix error handling in KeyValueContainerCheck. Contributed by Supratim Deka.

This commit is contained in:
Yiqun Lin 2019-04-03 14:01:30 +08:00
parent aaaf856f4b
commit f96fb05a2b
2 changed files with 81 additions and 215 deletions

View File

@ -68,9 +68,26 @@ public class KeyValueContainerCheck {
}
/**
* fast checks are basic and do not look inside the metadata files.
* Or into the structures on disk. These checks can be done on Open
* containers as well without concurrency implications
* Run basic integrity checks on container metadata.
* These checks do not look inside the metadata files.
* Applicable for OPEN containers.
*
* @return true : corruption detected, false : no corruption.
*/
public boolean fastCheck() {
boolean corruption = false;
try {
basicChecks();
} catch (IOException e) {
handleCorruption(e);
corruption = true;
}
return corruption;
}
/**
* Checks :
* 1. check directory layout
* 2. check container file
@ -78,24 +95,14 @@ public class KeyValueContainerCheck {
* @return void
*/
public KvCheckError fastCheck() {
private void basicChecks() throws IOException {
KvCheckError error;
LOG.trace("Running fast check for container {};", containerID);
LOG.trace("Running basic checks for container {};", containerID);
error = loadContainerData();
if (error != KvCheckError.ERROR_NONE) {
return error;
}
loadContainerData();
error = checkLayout();
if (error != KvCheckError.ERROR_NONE) {
return error;
}
error = checkContainerFile();
return error;
checkLayout();
checkContainerFile();
}
/**
@ -107,129 +114,80 @@ public class KeyValueContainerCheck {
* <p>
* fullCheck is a superset of fastCheck
*
* @return void
* @return true : corruption detected, false : no corruption.
*/
public KvCheckError fullCheck() {
/**
public boolean fullCheck() {
boolean corruption = false;
*/
KvCheckError error;
try {
basicChecks();
checkBlockDB();
error = fastCheck();
if (error != KvCheckError.ERROR_NONE) {
LOG.trace("fastCheck failed, aborting full check for Container {}",
containerID);
return error;
} catch (IOException e) {
handleCorruption(e);
corruption = true;
}
error = checkBlockDB();
return error;
return corruption;
}
/**
* Check the integrity of the directory structure of the container.
*
* @return error code or ERROR_NONE
*/
private KvCheckError checkLayout() {
boolean success;
KvCheckError error = KvCheckError.ERROR_NONE;
private void checkLayout() throws IOException {
// is metadataPath accessible as a directory?
try {
checkDirPath(metadataPath);
} catch (IOException ie) {
error = KvCheckError.METADATA_PATH_ACCESS;
handleCorruption(ie.getMessage(), error, ie);
return error;
}
checkDirPath(metadataPath);
String chunksPath = onDiskContainerData.getChunksPath();
// is chunksPath accessible as a directory?
try {
checkDirPath(chunksPath);
} catch (IOException ie) {
error = KvCheckError.CHUNKS_PATH_ACCESS;
handleCorruption(ie.getMessage(), error, ie);
return error;
}
return error;
String chunksPath = onDiskContainerData.getChunksPath();
checkDirPath(chunksPath);
}
private void checkDirPath(String path) throws IOException {
File dirPath = new File(path);
String errStr = null;
boolean success = true;
try {
if (!dirPath.isDirectory()) {
success = false;
errStr = "Not a directory [" + path + "]";
throw new IOException(errStr);
}
} catch (SecurityException se) {
throw new IOException("Security exception checking dir ["
+ path + "]", se);
} catch (Exception e) {
throw new IOException("Generic exception checking dir ["
+ path + "]", e);
}
try {
String[] ls = dirPath.list();
if (ls == null) {
// null result implies operation failed
success = false;
errStr = "null listing for directory [" + path + "]";
}
} catch (Exception e) {
throw new IOException("Exception listing dir [" + path + "]", e);
}
if (!success) {
Preconditions.checkState(errStr != null);
String[] ls = dirPath.list();
if (ls == null) {
// null result implies operation failed
errStr = "null listing for directory [" + path + "]";
throw new IOException(errStr);
}
}
private KvCheckError checkContainerFile() {
private void checkContainerFile() throws IOException {
/**
* compare the values in the container file loaded from disk,
* with the values we are expecting
*/
KvCheckError error = KvCheckError.ERROR_NONE;
String dbType;
Preconditions
.checkState(onDiskContainerData != null, "Container File not loaded");
KvCheckAction next;
try {
ContainerUtils.verifyChecksum(onDiskContainerData);
} catch (Exception e) {
error = KvCheckError.CONTAINERDATA_CKSUM;
handleCorruption("Container File Checksum mismatch", error, e);
return error;
}
ContainerUtils.verifyChecksum(onDiskContainerData);
if (onDiskContainerData.getContainerType()
!= ContainerProtos.ContainerType.KeyValueContainer) {
String errStr = "Bad Container type in Containerdata for " + containerID;
error = KvCheckError.CONTAINERDATA_TYPE;
handleCorruption(errStr, error, null);
return error; // Abort if we do not know the type of Container
throw new IOException(errStr);
}
if (onDiskContainerData.getContainerID() != containerID) {
String errStr =
"Bad ContainerID field in Containerdata for " + containerID;
error = KvCheckError.CONTAINERDATA_ID;
next = handleCorruption(errStr, error, null);
if (next == KvCheckAction.ABORT) {
return error;
} // else continue checking other data elements
throw new IOException(errStr);
}
dbType = onDiskContainerData.getContainerDBType();
@ -237,9 +195,7 @@ public class KeyValueContainerCheck {
!dbType.equals(OZONE_METADATA_STORE_IMPL_LEVELDB)) {
String errStr = "Unknown DBType [" + dbType
+ "] in Container File for [" + containerID + "]";
error = KvCheckError.CONTAINERDATA_DBTYPE;
handleCorruption(errStr, error, null);
return error;
throw new IOException(errStr);
}
KeyValueContainerData kvData = onDiskContainerData;
@ -248,17 +204,11 @@ public class KeyValueContainerCheck {
"Bad metadata path in Containerdata for " + containerID + "Expected ["
+ metadataPath.toString() + "] Got [" + kvData.getMetadataPath()
+ "]";
error = KvCheckError.CONTAINERDATA_METADATA_PATH;
next = handleCorruption(errStr, error, null);
if (next == KvCheckAction.ABORT) {
return error;
}
throw new IOException(errStr);
}
return error;
}
private KvCheckError checkBlockDB() {
private void checkBlockDB() throws IOException {
/**
* Check the integrity of the DB inside each container.
* In Scope:
@ -269,52 +219,31 @@ public class KeyValueContainerCheck {
* 1. chunk checksum verification. this is left to a separate
* slow chunk scanner
*/
KvCheckError error;
Preconditions.checkState(onDiskContainerData != null,
"invoke loadContainerData prior to calling this function");
File dbFile;
File metaDir = new File(metadataPath);
try {
dbFile = KeyValueContainerLocationUtil
.getContainerDBFile(metaDir, containerID);
dbFile = KeyValueContainerLocationUtil
.getContainerDBFile(metaDir, containerID);
if (!dbFile.exists() || !dbFile.canRead()) {
String dbFileErrorMsg = "Unable to access DB File [" + dbFile.toString()
+ "] for Container [" + containerID + "] metadata path ["
+ metadataPath + "]";
error = KvCheckError.DB_ACCESS;
handleCorruption(dbFileErrorMsg, error, null);
return error;
}
} catch (Exception e) {
String dbFileErrorMessage =
"Exception when initializing DBFile" + "with metadatapath ["
+ metadataPath + "] for Container [" + containerID
+ "]";
error = KvCheckError.DB_ACCESS;
handleCorruption(dbFileErrorMessage, error, e);
return error;
if (!dbFile.exists() || !dbFile.canRead()) {
String dbFileErrorMsg = "Unable to access DB File [" + dbFile.toString()
+ "] for Container [" + containerID + "] metadata path ["
+ metadataPath + "]";
throw new IOException(dbFileErrorMsg);
}
onDiskContainerData.setDbFile(dbFile);
MetadataStore db = BlockUtils
.getDB(onDiskContainerData, checkConfig);
try {
MetadataStore db = BlockUtils
.getDB(onDiskContainerData, checkConfig);
error = iterateBlockDB(db);
} catch (Exception e) {
error = KvCheckError.DB_ITERATOR;
handleCorruption("Block DB Iterator aborted", error, e);
return error;
}
return error;
iterateBlockDB(db);
}
private KvCheckError iterateBlockDB(MetadataStore db)
private void iterateBlockDB(MetadataStore db)
throws IOException {
KvCheckError error = KvCheckError.ERROR_NONE;
Preconditions.checkState(db != null);
// get "normal" keys from the Block DB
@ -328,103 +257,39 @@ public class KeyValueContainerCheck {
List<ContainerProtos.ChunkInfo> chunkInfoList = block.getChunks();
for (ContainerProtos.ChunkInfo chunk : chunkInfoList) {
File chunkFile;
try {
chunkFile = ChunkUtils
.getChunkFile(onDiskContainerData,
ChunkInfo.getFromProtoBuf(chunk));
} catch (Exception e) {
error = KvCheckError.MISSING_CHUNK_FILE;
handleCorruption("Unable to access chunk path", error, e);
return error;
}
chunkFile = ChunkUtils.getChunkFile(onDiskContainerData,
ChunkInfo.getFromProtoBuf(chunk));
if (!chunkFile.exists()) {
error = KvCheckError.MISSING_CHUNK_FILE;
// concurrent mutation in Block DB? lookup the block again.
byte[] bdata = db.get(
Longs.toByteArray(block.getBlockID().getLocalID()));
if (bdata == null) {
LOG.trace("concurrency with delete, ignoring deleted block");
error = KvCheckError.ERROR_NONE;
break; // skip to next block from kvIter
} else {
handleCorruption("Missing chunk file", error, null);
return error;
String errorStr = "Missing chunk file "
+ chunkFile.getAbsolutePath();
throw new IOException(errorStr);
}
}
}
}
return error;
}
private KvCheckError loadContainerData() {
KvCheckError error = KvCheckError.ERROR_NONE;
private void loadContainerData() throws IOException {
File containerFile = KeyValueContainer
.getContainerFile(metadataPath.toString(), containerID);
try {
onDiskContainerData = (KeyValueContainerData) ContainerDataYaml
.readContainerFile(containerFile);
} catch (IOException e) {
error = KvCheckError.FILE_LOAD;
handleCorruption("Unable to load Container File", error, e);
}
return error;
onDiskContainerData = (KeyValueContainerData) ContainerDataYaml
.readContainerFile(containerFile);
}
private KvCheckAction handleCorruption(String reason,
KvCheckError error, Exception e) {
// XXX HDDS-1201 need to implement corruption handling/reporting
private void handleCorruption(IOException e) {
String errStr =
"Corruption detected in container: [" + containerID + "] reason: ["
+ reason + "] error code: [" + error + "]";
String logMessage = null;
StackTraceElement[] stackeElems = Thread.currentThread().getStackTrace();
String caller =
"Corruption reported from Source File: [" + stackeElems[2].getFileName()
+ "] Line: [" + stackeElems[2].getLineNumber() + "]";
if (e != null) {
logMessage = errStr + " exception: [" + e.getMessage() + "]";
e.printStackTrace();
} else {
logMessage = errStr;
}
LOG.error(caller);
"Corruption detected in container: [" + containerID + "] ";
String logMessage = errStr + "Exception: [" + e.getMessage() + "]";
LOG.error(logMessage);
return KvCheckAction.ABORT;
}
/**
* Pre-defined error codes for Container Metadata check.
*/
public enum KvCheckError {
ERROR_NONE,
FILE_LOAD, // unable to load container metafile
METADATA_PATH_ACCESS, // metadata path is not accessible
CHUNKS_PATH_ACCESS, // chunks path is not accessible
CONTAINERDATA_ID, // bad Container-ID stored in Container file
CONTAINERDATA_METADATA_PATH, // bad metadata path in Container file
CONTAINERDATA_CHUNKS_PATH, // bad chunks path in Container file
CONTAINERDATA_CKSUM, // container file checksum mismatch
CONTAINERDATA_TYPE, // container file incorrect type of Container
CONTAINERDATA_DBTYPE, // unknown DB Type specified in Container File
DB_ACCESS, // unable to load Metastore DB
DB_ITERATOR, // unable to create block iterator for Metastore DB
MISSING_CHUNK_FILE // chunk file not found
}
private enum KvCheckAction {
CONTINUE, // Continue with remaining checks on the corrupt Container
ABORT // Abort checks for the container
}
}

View File

@ -55,6 +55,7 @@ import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_METADATA_STORE_IMPL;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_METADATA_STORE_IMPL_LEVELDB;
import static org.apache.hadoop.ozone.OzoneConfigKeys.OZONE_METADATA_STORE_IMPL_ROCKSDB;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
/**
@ -100,7 +101,7 @@ import static org.junit.Assert.assertTrue;
int deletedBlocks = 1;
int normalBlocks = 3;
int chunksPerBlock = 4;
KeyValueContainerCheck.KvCheckError error;
boolean corruption = false;
// test Closed Container
createContainerWithBlocks(containerID, normalBlocks, deletedBlocks, 65536,
@ -114,14 +115,14 @@ import static org.junit.Assert.assertTrue;
containerID);
// first run checks on a Open Container
error = kvCheck.fastCheck();
assertTrue(error == KeyValueContainerCheck.KvCheckError.ERROR_NONE);
corruption = kvCheck.fastCheck();
assertFalse(corruption);
container.close();
// next run checks on a Closed Container
error = kvCheck.fullCheck();
assertTrue(error == KeyValueContainerCheck.KvCheckError.ERROR_NONE);
corruption = kvCheck.fullCheck();
assertFalse(corruption);
}
/**