From d7178aa27c55ae06cfbd6db94d6839311dfb4089 Mon Sep 17 00:00:00 2001
From: Andrew Purtell
Date: Wed, 12 Aug 2015 18:36:23 -0700
Subject: [PATCH] HBASE-13985 Add configuration to skip validating HFile format when bulk loading (Victor Xu)

---
Reviewer notes (free text after the separator; not part of the change itself):
  * Introduces the boolean setting 'hbase.loadincremental.validate.hfile', read with a
    default of true immediately before discoverLoadQueue() is called.
  * The value is passed as 'validateHFile' through discoverLoadQueue() into a new
    four-argument visitBulkHFiles() overload. The existing three-argument overload now
    delegates to it with validation switched on.
  * With validateHFile == false, the HFile.isHFileFormat() check and the
    FileNotFoundException handling wrapped around it are both bypassed, so every file
    that reaches that point is handed to visitor.bulkHFile(). A warning is logged at the
    call site when validation is disabled.
  * NOTE(review): this copy looks like it lost its line breaks and leading indentation
    (indentation below is reconstructed), and text inside angle brackets seems to have
    been stripped (no author e-mail; raw Deque / Collection / LinkedList / ArrayList).
    Confirm against upstream commit d7178aa27c55 before trying to apply it.

 .../mapreduce/LoadIncrementalHFiles.java      | 48 ++++++++++++++-----
 1 file changed, 36 insertions(+), 12 deletions(-)

diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/LoadIncrementalHFiles.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/LoadIncrementalHFiles.java
index 9b5e222d062..9ff8a22b14b 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/LoadIncrementalHFiles.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/mapreduce/LoadIncrementalHFiles.java
@@ -169,6 +169,17 @@ public class LoadIncrementalHFiles extends Configured implements Tool {
    */
   private static void visitBulkHFiles(final FileSystem fs, final Path bulkDir,
     final BulkHFileVisitor visitor) throws IOException {
+    visitBulkHFiles(fs, bulkDir, visitor, true);
+  }
+
+  /**
+   * Iterate over the bulkDir hfiles.
+   * Skip reference, HFileLink, files starting with "_".
+   * Check and skip non-valid hfiles by default, or skip this validation by setting
+   * 'hbase.loadincremental.validate.hfile' to false.
+   */
+  private static void visitBulkHFiles(final FileSystem fs, final Path bulkDir,
+    final BulkHFileVisitor visitor, final boolean validateHFile) throws IOException {
     if (!fs.exists(bulkDir)) {
       throw new FileNotFoundException("Bulkload dir " + bulkDir + " not found");
     }
@@ -209,16 +220,18 @@ public class LoadIncrementalHFiles extends Configured implements Tool {
           continue;
         }
 
-        // Validate HFile Format
-        try {
-          if (!HFile.isHFileFormat(fs, hfile)) {
-            LOG.warn("the file " + hfile + " doesn't seems to be an hfile. skipping");
+        // Validate HFile Format if needed
+        if (validateHFile) {
+          try {
+            if (!HFile.isHFileFormat(fs, hfile)) {
+              LOG.warn("the file " + hfile + " doesn't seems to be an hfile. skipping");
+              continue;
+            }
+          } catch (FileNotFoundException e) {
+            LOG.warn("the file " + hfile + " was removed");
             continue;
           }
-        } catch (FileNotFoundException e) {
-          LOG.warn("the file " + hfile + " was removed");
-          continue;
-        }
+        }
 
         visitor.bulkHFile(family, hfileStatus);
       }
@@ -252,8 +265,8 @@ public class LoadIncrementalHFiles extends Configured implements Tool {
    * Walk the given directory for all HFiles, and return a Queue
    * containing all such files.
    */
-  private void discoverLoadQueue(final Deque ret, final Path hfofDir)
-  throws IOException {
+  private void discoverLoadQueue(final Deque ret, final Path hfofDir,
+    final boolean validateHFile) throws IOException {
     fs = hfofDir.getFileSystem(getConf());
     visitBulkHFiles(fs, hfofDir, new BulkHFileVisitor() {
       @Override
@@ -270,7 +283,7 @@ public class LoadIncrementalHFiles extends Configured implements Tool {
         }
         ret.add(new LoadQueueItem(family, hfile.getPath()));
       }
-    });
+    }, validateHFile);
   }
 
   /**
@@ -323,7 +336,18 @@ public class LoadIncrementalHFiles extends Configured implements Tool {
       // happen in this thread
       Deque queue = new LinkedList();
       try {
-        discoverLoadQueue(queue, hfofDir);
+        /*
+         * Checking hfile format is a time-consuming operation, we should have an option to skip
+         * this step when bulkloading millions of HFiles. See HBASE-13985.
+         */
+        boolean validateHFile = getConf().getBoolean("hbase.loadincremental.validate.hfile", true);
+        if(!validateHFile) {
+          LOG.warn("You are skipping HFiles validation, it might cause some data loss if files " +
+              "are not correct. If you fail to read data from your table after using this " +
+              "option, consider removing the files and bulkload again without this option. " +
+              "See HBASE-13985");
+        }
+        discoverLoadQueue(queue, hfofDir, validateHFile);
         // check whether there is invalid family name in HFiles to be bulkloaded
         Collection families = table.getTableDescriptor().getFamilies();
         ArrayList familyNames = new ArrayList(families.size());