diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/Driver.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/Driver.java index 18f1617b877..d52a31067f4 100644 --- a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/Driver.java +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mapreduce/Driver.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.mapreduce; import org.apache.hadoop.hbase.HBaseInterfaceAudience; import org.apache.hadoop.hbase.mapreduce.replication.VerifyReplication; +import org.apache.hadoop.hbase.mob.mapreduce.MobRefReporter; import org.apache.hadoop.hbase.snapshot.ExportSnapshot; import org.apache.hadoop.hbase.tool.BulkLoadHFilesTool; import org.apache.hadoop.util.ProgramDriver; @@ -55,6 +56,8 @@ public class Driver { pgd.addClass(WALPlayer.NAME, WALPlayer.class, "Replay WAL files."); pgd.addClass(ExportSnapshot.NAME, ExportSnapshot.class, "Export" + " the specific snapshot to a given FileSystem."); + pgd.addClass(MobRefReporter.NAME, MobRefReporter.class, "Check the mob cells in a particular " + + "table and cf and confirm that the files they point to are correct."); ProgramDriver.class.getMethod("driver", new Class [] {String[].class}). invoke(pgd, new Object[]{args}); diff --git a/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mob/mapreduce/MobRefReporter.java b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mob/mapreduce/MobRefReporter.java new file mode 100644 index 00000000000..929d64a289b --- /dev/null +++ b/hbase-mapreduce/src/main/java/org/apache/hadoop/hbase/mob/mapreduce/MobRefReporter.java @@ -0,0 +1,509 @@ +/** + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.mob.mapreduce; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Base64; +import java.util.HashSet; +import java.util.Set; +import java.util.UUID; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.conf.Configured; +import org.apache.hadoop.fs.FileStatus; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hbase.Cell; +import org.apache.hadoop.hbase.HBaseConfiguration; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.Admin; +import org.apache.hadoop.hbase.client.ColumnFamilyDescriptor; +import org.apache.hadoop.hbase.client.Connection; +import org.apache.hadoop.hbase.client.ConnectionFactory; +import org.apache.hadoop.hbase.client.Result; +import org.apache.hadoop.hbase.client.Scan; +import org.apache.hadoop.hbase.client.TableDescriptor; +import org.apache.hadoop.hbase.io.HFileLink; +import org.apache.hadoop.hbase.io.ImmutableBytesWritable; +import org.apache.hadoop.hbase.mapreduce.TableInputFormat; +import org.apache.hadoop.hbase.mapreduce.TableMapReduceUtil; +import org.apache.hadoop.hbase.mapreduce.TableMapper; +import org.apache.hadoop.hbase.mob.MobConstants; +import org.apache.hadoop.hbase.mob.MobUtils; +import org.apache.hadoop.hbase.util.Bytes; +import 
org.apache.hadoop.hbase.util.EnvironmentEdgeManager; +import org.apache.hadoop.hbase.util.FSUtils; +import org.apache.hadoop.hbase.util.HFileArchiveUtil; +import org.apache.hadoop.hbase.util.Pair; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Reducer; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.security.UserGroupInformation; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.yetus.audience.InterfaceAudience; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +/** + * Scans a given table + CF for all mob reference cells to get the list of backing mob files. + * For each referenced file we attempt to verify that said file is on the FileSystem in a place + * that the MOB system will look when attempting to resolve the actual value. + * + * The job includes counters that can help provide a rough sketch of the mob data. + * + *
+ * Map-Reduce Framework
+ *         Map input records=10000
+ * ...
+ *         Reduce output records=99
+ * ...
+ * CELLS PER ROW
+ *         Number of rows with 1s of cells per row=10000
+ * MOB
+ *         NUM_CELLS=52364
+ * PROBLEM
+ *         Affected rows=338
+ *         Problem MOB files=2
+ * ROWS WITH PROBLEMS PER FILE
+ *         Number of HFiles with 100s of affected rows=2
+ * SIZES OF CELLS
+ *         Number of cells with size in the 10,000s of bytes=627
+ *         Number of cells with size in the 100,000s of bytes=51392
+ *         Number of cells with size in the 1,000,000s of bytes=345
+ * SIZES OF ROWS
+ *         Number of rows with total size in the 100,000s of bytes=6838
+ *         Number of rows with total size in the 1,000,000s of bytes=3162
+ * 
+ * + * * Map-Reduce Framework:Map input records - the number of rows with mob references + * * Map-Reduce Framework:Reduce output records - the number of unique hfiles referenced + * * MOB:NUM_CELLS - the total number of mob reference cells + * * PROBLEM:Affected rows - the number of rows that reference hfiles with an issue + * * PROBLEM:Problem MOB files - the number of unique hfiles that have an issue + * * CELLS PER ROW: - this counter group gives a histogram of the order of magnitude of the + * number of cells in a given row by grouping by the number of digits used in each count. + * This allows us to see more about the distribution of cells than what we can determine + * with just the cell count and the row count. In this particular example we can see that + * all of our rows have somewhere between 1 - 9 cells. + * * ROWS WITH PROBLEMS PER FILE: - this counter group gives a histogram of the order of + * magnitude of the number of rows in each of the hfiles with a problem. e.g. in the + * example there are 2 hfiles and they each have the same order of magnitude number of rows, + * specifically between 100 and 999. + * * SIZES OF CELLS: - this counter group gives a histogram of the order of magnitude of + * the size of mob values according to our reference cells. e.g. in the example above we + * have cell sizes that are all between 10,000 bytes and 9,999,999 bytes. From this + * histogram we can also see that _most_ cells are 100,000 - 999,000 bytes and the smaller + * and bigger ones are outliers making up less than 2% of mob cells. + * * SIZES OF ROWS: - this counter group gives a histogram of the order of magnitude of the + * size of mob values across each row according to our reference cells. In the example above + * we have rows that are are between 100,000 bytes and 9,999,999 bytes. We can also see that + * about 2/3rd of our rows are 100,000 - 999,999 bytes. + * + * Generates a report that gives one file status per line, with tabs dividing fields. + * + *
+ * RESULT OF LOOKUP	FILE REF	comma separated, base64 encoded rows when there's a problem
+ * 
+ * + * e.g. + * + *
+ * MOB DIR	09c576e28a65ed2ead0004d192ffaa382019110184b30a1c7e034573bf8580aef8393402
+ * MISSING FILE    28e252d7f013973174750d483d358fa020191101f73536e7133f4cd3ab1065edf588d509        MmJiMjMyYzBiMTNjNzc0OTY1ZWY4NTU4ZjBmYmQ2MTUtNTIz,MmEzOGE0YTkzMTZjNDllNWE4MzM1MTdjNDVkMzEwNzAtODg=
+ * 
+ * + * Possible results are listed; the first three indicate things are working properly. + * * MOB DIR - the reference is in the normal MOB area for the given table and CF + * * HLINK TO ARCHIVE FOR SAME TABLE - the reference is present in the archive area for this + * table and CF + * * HLINK TO ARCHIVE FOR OTHER TABLE - the reference is present in a different table and CF, + * either in the MOB or archive areas (e.g. from a snapshot restore or clone) + * * ARCHIVE WITH HLINK BUT NOT FROM OUR TABLE - the reference is currently present in the archive + * area for this table and CF, but it is kept there because a _different_ table has a + * reference to it (e.g. from a snapshot clone). If these other tables are removed then + * the file will likely be deleted unless there is a snapshot also referencing it. + * * ARCHIVE BUT NO HLINKS - the reference is currently present in the archive for this table and + * CF, but there are no references present to prevent its removal. Unless it is newer than + * the general TTL (default 5 minutes) or referenced in a snapshot it will be subject to + * cleaning. + * * ARCHIVE BUT FAILURE WHILE CHECKING HLINKS - Check the job logs to see why things failed while + * looking for why this file is being kept around. + * * MISSING FILE - We couldn't find the reference on the FileSystem. Either there is dataloss due + * to a bug in the MOB storage system or the MOB storage is damaged but in an edge case that + * allows it to work for now. You can verify which by doing a raw reference scan to get the + * referenced hfile and check the underlying filesystem. See the ref guide section on mob + * for details. + * * HLINK BUT POINT TO MISSING FILE - There is a pointer in our mob area for this table and CF + * to a file elsewhere on the FileSystem, however the file it points to no longer exists. 
+ * * MISSING FILE BUT FAILURE WHILE CHECKING HLINKS - We could not find the referenced file, + * however you should check the job logs to see why we couldn't check to see if there is a + * pointer to the referenced file in our archive or another table's archive or mob area. + * + */ +@InterfaceAudience.Private +public class MobRefReporter extends Configured implements Tool { + private static Logger LOG = LoggerFactory.getLogger(MobRefReporter.class); + public static final String NAME = "mobrefs"; + static final String REPORT_JOB_ID = "mob.report.job.id"; + static final String REPORT_START_DATETIME = "mob.report.job.start"; + + public static class MobRefMapper extends TableMapper { + @Override + public void map(ImmutableBytesWritable r, Result columns, Context context) throws IOException, + InterruptedException { + if (columns == null) { + return; + } + Cell[] cells = columns.rawCells(); + if (cells == null || cells.length == 0) { + return; + } + Set files = new HashSet<>(); + long count = 0; + long size = 0; + for (Cell c : cells) { + if (MobUtils.hasValidMobRefCellValue(c)) { + // TODO confirm there aren't tags + String fileName = MobUtils.getMobFileName(c); + if (!files.contains(fileName)) { + context.write(new Text(fileName), r); + files.add(fileName); + } + final int cellsize = MobUtils.getMobValueLength(c); + context.getCounter("SIZES OF CELLS", "Number of cells with size in the " + + log10GroupedString(cellsize) + "s of bytes").increment(1L); + size += cellsize; + count++; + } else { + LOG.debug("cell is not a mob ref, even though we asked for only refs. 
cell={}", c); + } + } + context.getCounter("CELLS PER ROW", "Number of rows with " + log10GroupedString(count) + + "s of cells per row").increment(1L); + context.getCounter("SIZES OF ROWS", "Number of rows with total size in the " + + log10GroupedString(size) + "s of bytes").increment(1L); + context.getCounter("MOB","NUM_CELLS").increment(count); + } + } + + public static class MobRefReducer extends + Reducer { + + TableName table; + String mobRegion; + Path mob; + Path archive; + String seperator; + + /* Results that mean things are fine */ + final Text OK_MOB_DIR = new Text("MOB DIR"); + final Text OK_HLINK_RESTORE = new Text("HLINK TO ARCHIVE FOR SAME TABLE"); + final Text OK_HLINK_CLONE = new Text("HLINK TO ARCHIVE FOR OTHER TABLE"); + /* Results that mean something is incorrect */ + final Text INCONSISTENT_ARCHIVE_BAD_LINK = + new Text("ARCHIVE WITH HLINK BUT NOT FROM OUR TABLE"); + final Text INCONSISTENT_ARCHIVE_STALE = new Text("ARCHIVE BUT NO HLINKS"); + final Text INCONSISTENT_ARCHIVE_IOE = new Text("ARCHIVE BUT FAILURE WHILE CHECKING HLINKS"); + /* Results that mean data is probably already gone */ + final Text DATALOSS_MISSING = new Text("MISSING FILE"); + final Text DATALOSS_HLINK_DANGLING = new Text("HLINK BUT POINTS TO MISSING FILE"); + final Text DATALOSS_MISSING_IOE = new Text("MISSING FILE BUT FAILURE WHILE CHECKING HLINKS"); + final Base64.Encoder base64 = Base64.getEncoder(); + + @Override + public void setup(Context context) throws IOException, InterruptedException { + final Configuration conf = context.getConfiguration(); + final String tableName = conf.get(TableInputFormat.INPUT_TABLE); + if (null == tableName) { + throw new IOException("Job configuration did not include table."); + } + table = TableName.valueOf(tableName); + mobRegion = MobUtils.getMobRegionInfo(table).getEncodedName(); + final String family = conf.get(TableInputFormat.SCAN_COLUMN_FAMILY); + if (null == family) { + throw new IOException("Job configuration did not include 
column family"); + } + mob = MobUtils.getMobFamilyPath(conf, table, family); + LOG.info("Using active mob area '{}'", mob); + archive = HFileArchiveUtil.getStoreArchivePath(conf, table, + MobUtils.getMobRegionInfo(table).getEncodedName(), family); + LOG.info("Using archive mob area '{}'", archive); + seperator = conf.get(TextOutputFormat.SEPERATOR, "\t"); + } + + @Override + public void reduce(Text key, Iterable rows, Context context) + throws IOException, InterruptedException { + final Configuration conf = context.getConfiguration(); + final String file = key.toString(); + // active mob area + if (mob.getFileSystem(conf).exists(new Path(mob, file))) { + LOG.debug("Found file '{}' in mob area", file); + context.write(OK_MOB_DIR, key); + // archive area - is there an hlink back reference (from a snapshot from same table) + } else if (archive.getFileSystem(conf).exists(new Path(archive, file))) { + + Path backRefDir = HFileLink.getBackReferencesDir(archive, file); + try { + FileStatus[] backRefs = FSUtils.listStatus(archive.getFileSystem(conf), backRefDir); + if (backRefs != null) { + boolean found = false; + for (FileStatus backRef : backRefs) { + Pair refParts = HFileLink.parseBackReferenceName( + backRef.getPath().getName()); + if (table.equals(refParts.getFirst()) && mobRegion.equals(refParts.getSecond())) { + Path hlinkPath = HFileLink.getHFileFromBackReference(MobUtils.getMobHome(conf), + backRef.getPath()); + if (hlinkPath.getFileSystem(conf).exists(hlinkPath)) { + found = true; + } else { + LOG.warn("Found file '{}' in archive area with a back reference to the mob area " + + "for our table, but the mob area does not have a corresponding hfilelink.", + file); + } + } + } + if (found) { + LOG.debug("Found file '{}' in archive area. 
has proper hlink back references to " + + "suggest it is from a restored snapshot for this table.", file); + context.write(OK_HLINK_RESTORE, key); + } else { + LOG.warn("Found file '{}' in archive area, but the hlink back references do not " + + "properly point to the mob area for our table.", file); + context.write(INCONSISTENT_ARCHIVE_BAD_LINK, encodeRows(context, key, rows)); + } + } else { + LOG.warn("Found file '{}' in archive area, but there are no hlinks pointing to it. Not " + + "yet used snapshot or an error.", file); + context.write(INCONSISTENT_ARCHIVE_STALE, encodeRows(context, key, rows)); + } + } catch (IOException e) { + LOG.warn("Found file '{}' in archive area, but got an error while checking " + + "on back references.", file, e); + context.write(INCONSISTENT_ARCHIVE_IOE, encodeRows(context, key, rows)); + } + + } else { + // check for an hlink in the active mob area (from a snapshot of a different table) + try { + /** + * we are doing this ourselves instead of using FSUtils.getReferenceFilePaths because + * we know the mob region never splits, so we can only have HFileLink references + * and looking for just them is cheaper then listing everything. + * + * This glob should match the naming convention for HFileLinks to our referenced hfile. + * As simplified explanation those file names look like "table=region-hfile". For details + * see the {@link HFileLink#createHFileLinkName HFileLink implementation}. 
+ */ + FileStatus[] hlinks = mob.getFileSystem(conf).globStatus(new Path(mob + "/*=*-" + file)); + if (hlinks != null && hlinks.length != 0) { + if (hlinks.length != 1) { + LOG.warn("Found file '{}' as hfilelinks in the mob area, but there are more than " + + "one: {}", file, Arrays.deepToString(hlinks)); + } + HFileLink found = null; + for (FileStatus hlink : hlinks) { + HFileLink tmp = HFileLink.buildFromHFileLinkPattern(conf, hlink.getPath()); + if (tmp.exists(archive.getFileSystem(conf))) { + found = tmp; + break; + } else { + LOG.debug("Target file does not exist for ref {}", tmp); + } + } + if (found != null) { + LOG.debug("Found file '{}' as a ref in the mob area: {}", file, found); + context.write(OK_HLINK_CLONE, key); + } else { + LOG.warn("Found file '{}' as ref(s) in the mob area but they do not point to an hfile" + + " that exists.", file); + context.write(DATALOSS_HLINK_DANGLING, encodeRows(context, key, rows)); + } + } else { + LOG.error("Could not find referenced file '{}'. See the docs on this tool.", file); + LOG.debug("Note that we don't have the server-side tag from the mob cells that says " + + "what table the reference is originally from. So if the HFileLink in this table " + + "is missing but the referenced file is still in the table from that tag, then " + + "lookups of these impacted rows will work. Do a scan of the reference details " + + "of the cell for the hfile name and then check the entire hbase install if this " + + "table was made from a snapshot of another table. see the ref guide section on " + + "mob for details."); + context.write(DATALOSS_MISSING, encodeRows(context, key, rows)); + } + } catch (IOException e) { + LOG.error( + "Exception while checking mob area of our table for HFileLinks that point to {}", + file, e); + context.write(DATALOSS_MISSING_IOE, encodeRows(context, key, rows)); + } + } + } + + /** + * reuses the passed Text key. 
appends the configured seperator and then a comma seperated list + * of base64 encoded row keys + */ + private Text encodeRows(Context context, Text key, Iterable rows) + throws IOException { + StringBuilder sb = new StringBuilder(key.toString()); + sb.append(seperator); + boolean moreThanOne = false; + long count = 0; + for (ImmutableBytesWritable row : rows) { + if (moreThanOne) { + sb.append(","); + } + sb.append(base64.encodeToString(row.copyBytes())); + moreThanOne = true; + count++; + } + context.getCounter("PROBLEM", "Problem MOB files").increment(1L); + context.getCounter("PROBLEM", "Affected rows").increment(count); + context.getCounter("ROWS WITH PROBLEMS PER FILE", "Number of HFiles with " + + log10GroupedString(count) + "s of affected rows").increment(1L); + key.set(sb.toString()); + return key; + } + } + + /** + * Returns the string representation of the given number after grouping it + * into log10 buckets. e.g. 0-9 -> 1, 10-99 -> 10, ..., 100,000-999,999 -> 100,000, etc. + */ + static String log10GroupedString(long number) { + return String.format("%,d", (long)(Math.pow(10d, Math.floor(Math.log10(number))))); + } + + /** + * Main method for the tool. + * @return 0 if success, 1 for bad args. 
2 if job aborted with an exception, + * 3 if mr job was unsuccessful + */ + public int run(String[] args) throws IOException, InterruptedException { + // TODO make family and table optional + if (args.length != 3) { + printUsage(); + return 1; + } + final String output = args[0]; + final String tableName = args[1]; + final String familyName = args[2]; + final long reportStartTime = EnvironmentEdgeManager.currentTime(); + Configuration conf = getConf(); + try { + FileSystem fs = FileSystem.get(conf); + // check whether the current user is the same one with the owner of hbase root + String currentUserName = UserGroupInformation.getCurrentUser().getShortUserName(); + FileStatus[] hbaseRootFileStat = fs.listStatus(new Path(conf.get(HConstants.HBASE_DIR))); + if (hbaseRootFileStat.length > 0) { + String owner = hbaseRootFileStat[0].getOwner(); + if (!owner.equals(currentUserName)) { + String errorMsg = "The current user[" + currentUserName + + "] does not have hbase root credentials." + + " If this job fails due to an inability to read HBase's internal directories, " + + "you will need to rerun as a user with sufficient permissions. 
The HBase superuser " + + "is a safe choice."; + LOG.warn(errorMsg); + } + } else { + LOG.error("The passed configs point to an HBase dir does not exist: {}", + conf.get(HConstants.HBASE_DIR)); + throw new IOException("The target HBase does not exist"); + } + + byte[] family; + int maxVersions; + TableName tn = TableName.valueOf(tableName); + try (Connection connection = ConnectionFactory.createConnection(conf); + Admin admin = connection.getAdmin()) { + TableDescriptor htd = admin.getDescriptor(tn); + ColumnFamilyDescriptor hcd = htd.getColumnFamily(Bytes.toBytes(familyName)); + if (hcd == null || !hcd.isMobEnabled()) { + throw new IOException("Column family " + familyName + " is not a MOB column family"); + } + family = hcd.getName(); + maxVersions = hcd.getMaxVersions(); + } + + + String id = getClass().getSimpleName() + UUID.randomUUID().toString().replace("-", ""); + Job job = null; + Scan scan = new Scan(); + scan.addFamily(family); + // Do not retrieve the mob data when scanning + scan.setAttribute(MobConstants.MOB_SCAN_RAW, Bytes.toBytes(Boolean.TRUE)); + scan.setAttribute(MobConstants.MOB_SCAN_REF_ONLY, Bytes.toBytes(Boolean.TRUE)); + // If a scanner caching value isn't set, pick a smaller default since we know we're doing + // a full table scan and don't want to impact other clients badly. + scan.setCaching(conf.getInt(HConstants.HBASE_CLIENT_SCANNER_CACHING, 10000)); + scan.setCacheBlocks(false); + scan.setMaxVersions(maxVersions); + conf.set(REPORT_JOB_ID, id); + + job = Job.getInstance(conf); + job.setJarByClass(getClass()); + TableMapReduceUtil.initTableMapperJob(tn, scan, + MobRefMapper.class, Text.class, ImmutableBytesWritable.class, job); + + job.setReducerClass(MobRefReducer.class); + job.setOutputFormatClass(TextOutputFormat.class); + TextOutputFormat.setOutputPath(job, new Path(output)); + + job.setJobName(getClass().getSimpleName() + "-" + tn + "-" + familyName); + // for use in the reducer. easier than re-parsing it out of the scan string. 
+ job.getConfiguration().set(TableInputFormat.SCAN_COLUMN_FAMILY, familyName); + + // Use when we start this job as the base point for file "recency". + job.getConfiguration().setLong(REPORT_START_DATETIME, reportStartTime); + + if (job.waitForCompletion(true)) { + LOG.info("Finished creating report for '{}', family='{}'", tn, familyName); + } else { + System.err.println("Job was not successful"); + return 3; + } + return 0; + + } catch (ClassNotFoundException | RuntimeException | IOException | InterruptedException e) { + System.err.println("Job aborted due to exception " + e); + return 2; // job failed + } + } + + public static void main(String[] args) throws Exception { + Configuration conf = HBaseConfiguration.create(); + int ret = ToolRunner.run(conf, new MobRefReporter(), args); + System.exit(ret); + } + + private void printUsage() { + System.err.println("Usage:\n" + "--------------------------\n" + MobRefReporter.class.getName() + + " output-dir tableName familyName"); + System.err.println(" output-dir Where to write output report."); + System.err.println(" tableName The table name"); + System.err.println(" familyName The column family name"); + } + +} diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HFileLink.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HFileLink.java index 959a5ab71e2..2dc4aeb606a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HFileLink.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/io/HFileLink.java @@ -508,7 +508,7 @@ public class HFileLink extends FileLink { return new Path(new Path(regionDir, familyPath.getName()), linkName); } - static Pair parseBackReferenceName(String name) { + public static Pair parseBackReferenceName(String name) { int separatorIndex = name.indexOf('.'); String linkRegionName = name.substring(0, separatorIndex); String tableSubstr = name.substring(separatorIndex + 1) diff --git a/src/main/asciidoc/_chapters/hbase_mob.adoc 
b/src/main/asciidoc/_chapters/hbase_mob.adoc index 8048772e504..913b29104c2 100644 --- a/src/main/asciidoc/_chapters/hbase_mob.adoc +++ b/src/main/asciidoc/_chapters/hbase_mob.adoc @@ -198,3 +198,116 @@ hbase> major_compact 't1', 'c1’, ‘MOB’ These commands are also available via `Admin.compact` and `Admin.majorCompact` methods. + +=== MOB architecture + +This section is derived from information found in +link:https://issues.apache.org/jira/browse/HBASE-11339[HBASE-11339]. For more information see +the attachment on that issue +"link:https://issues.apache.org/jira/secure/attachment/12724468/HBase%20MOB%20Design-v5.pdf[Base MOB Design-v5.pdf]". + +==== Overview +The MOB feature reduces the overall IO load for configured column families by storing values that +are larger than the configured threshold outside of the normal regions to avoid splits, merges, and +most importantly normal compactions. + +When a cell is first written to a region it is stored in the WAL and memstore regardless of value +size. When memstores from a column family configured to use MOB are eventually flushed two hfiles +are written simultaneously. Cells with a value smaller than the threshold size are written to a +normal region hfile. Cells with a value larger than the threshold are written into a special MOB +hfile and also have a MOB reference cell written into the normal region HFile. + +MOB reference cells have the same key as the cell they are based on. The value of the reference cell +is made up of two pieces of metadata: the size of the actual value and the MOB hfile that contains +the original cell. In addition to any tags originally written to HBase, the reference cell prepends +two additional tags. The first is a marker tag that says the cell is a MOB reference. This can be +used later to scan specifically just for reference cells. The second stores the namespace and table +at the time the MOB hfile is written out. 
This tag is used to optimize how the MOB system finds +the underlying value in MOB hfiles after a series of HBase snapshot operations (ref HBASE-12332). +Note that tags are only available within HBase servers and by default are not sent over RPCs. + +All MOB hfiles for a given table are managed within a logical region that does not directly serve +requests. When these MOB hfiles are created from a flush or MOB compaction they are placed in a +dedicated mob data area under the hbase root directory specific to the namespace, table, mob +logical region, and column family. In general that means a path structured like: + +---- +%HBase Root Dir%/mobdir/data/%namespace%/%table%/%logical region%/%column family%/ +---- + +With default configs, an example table named 'some_table' in the +default namespace with a MOB enabled column family named 'foo' this HDFS directory would be + +---- +/hbase/mobdir/data/default/some_table/372c1b27e3dc0b56c3a031926e5efbe9/foo/ +---- + +These MOB hfiles are maintained by special chores in the HBase Master rather than by any individual +Region Server. Specifically those chores take care of enforcing TTLs and compacting them. Note that +this compaction is primarily a matter of controlling the total number of files in HDFS because our +operational assumptions for MOB data is that it will seldom update or delete. + +When a given MOB hfile is no longer needed as a result of our compaction process it is archived just +like any normal hfile. Because the table's mob region is independent of all the normal regions it +can coexist with them in the regular archive storage area: + +---- +/hbase/archive/data/default/some_table/372c1b27e3dc0b56c3a031926e5efbe9/foo/ +---- + +The same hfile cleaning chores that take care of eventually deleting unneeded archived files from +normal regions thus also will take care of these MOB hfiles. 
+ +=== MOB Troubleshooting + +==== Retrieving MOB metadata through the HBase Shell + +While working on troubleshooting failures in the MOB system you can retrieve some of the internal +information through the HBase shell by specifying special attributes on a scan. + +---- +hbase(main):112:0> scan 'some_table', {STARTROW => '00012-example-row-key', LIMIT => 1, +hbase(main):113:1* CACHE_BLOCKS => false, ATTRIBUTES => { 'hbase.mob.scan.raw' => '1', +hbase(main):114:2* 'hbase.mob.scan.ref.only' => '1' } } +---- + +The MOB internal information is stored as four bytes for the size of the underlying cell value and +then a UTF8 string with the name of the MOB HFile that contains the underlying cell value. Note that +by default the entirety of this serialized structure will be passed through the HBase shell's binary +string converter. That means the bytes that make up the value size will most likely be written as +escaped non-printable byte values, e.g. '\x03', unless they happen to correspond to ASCII +characters. + +Let's look at a specific example: + +---- +hbase(main):112:0> scan 'some_table', {STARTROW => '00012-example-row-key', LIMIT => 1, +hbase(main):113:1* CACHE_BLOCKS => false, ATTRIBUTES => { 'hbase.mob.scan.raw' => '1', +hbase(main):114:2* 'hbase.mob.scan.ref.only' => '1' } } +ROW COLUMN+CELL + 00012-example-row-key column=foo:bar, timestamp=1511179764, value=\x00\x02|\x94d41d8cd98f00b204 + e9800998ecf8427e19700118ffd9c244fe69488bbc9f2c77d24a3e6a +1 row(s) in 0.0130 seconds +---- + +In this case the first four bytes are `\x00\x02|\x94` which corresponds to the bytes +`[0x00, 0x02, 0x7C, 0x94]`. (Note that the third byte was printed as the ASCII character '|'.) +Decoded as an integer this gives us an underlying value size of 162,964 bytes. + +The remaining bytes give us an HFile name, +'d41d8cd98f00b204e9800998ecf8427e19700118ffd9c244fe69488bbc9f2c77d24a3e6a'. This HFile will most +likely be stored in the designated MOB storage area for this specific table. 
However, the file could +also be in the archive area if this table is from a restored snapshot. Furthermore, if the table is +from a cloned snapshot of a different table then the file could be in either the active or archive +area of that source table. As mentioned in the explanation of MOB reference cells above, the Region +Server will use a server side tag to optimize looking at the mob and archive area of the correct +original table when finding the MOB HFile. Since your scan is client side it can't retrieve that tag +and you'll either need to already know the lineage of your table or you'll need to search across all +tables. + +Assuming you are authenticated as a user with HBase superuser rights, you can search for it: +---- +$> hdfs dfs -find /hbase -name \ + d41d8cd98f00b204e9800998ecf8427e19700118ffd9c244fe69488bbc9f2c77d24a3e6a +/hbase/mobdir/data/default/some_table/372c1b27e3dc0b56c3a031926e5efbe9/foo/d41d8cd98f00b204e9800998ecf8427e19700118ffd9c244fe69488bbc9f2c77d24a3e6a +----