HBASE-6134 Improvement for split-worker to speed up distributed log splitting (Chunhui)

git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1349632 13f79535-47bb-0310-9956-ffa450edef68
2012-06-13 04:34:03 +00:00 · 2012-06-13 04:34:03 +00:00 · 0fdf8126c5
parent 5e3b1e4b5d
commit 0fdf8126c5
2 changed files with 212 additions and 170 deletions
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogSplitter.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLogSplitter.java
@ -33,6 +33,14 @@ import java.util.Map;
 import java.util.Set;
 import java.util.TreeMap;
 import java.util.TreeSet;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletionService;
+import java.util.concurrent.ExecutionException;
+import java.util.concurrent.ExecutorCompletionService;
+import java.util.concurrent.Future;
+import java.util.concurrent.ThreadFactory;
+import java.util.concurrent.ThreadPoolExecutor;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicReference;

 import org.apache.commons.logging.Log;
@ -58,6 +66,7 @@ import org.apache.hadoop.hbase.util.CancelableProgressable;
 import org.apache.hadoop.hbase.util.ClassSize;
 import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
 import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hbase.util.Threads;
 import org.apache.hadoop.hbase.zookeeper.ZKSplitLog;
 import org.apache.hadoop.io.MultipleIOException;

@ -109,6 +118,9 @@ public class HLogSplitter {

  private MonitoredTask status;

+  // Used in distributed log splitting
+  private DistributedLogSplittingHelper distributedLogSplittingHelper = null;
+

  /**
   * Create a new HLogSplitter using the given {@link Configuration} and the
@ -238,6 +250,10 @@ public class HLogSplitter {
    return outputSink.getOutputCounts();
  }

+  void setDistributedLogSplittingHelper(DistributedLogSplittingHelper helper) {
+    this.distributedLogSplittingHelper = helper;
+  }
+
  /**
   * Splits the HLog edits in the given list of logfiles (that are a mix of edits
   * on multiple regions) by region and then splits them per region directories,
@ -270,7 +286,7 @@ public class HLogSplitter {
    countTotalBytes(logfiles);
    splitSize = 0;

-    outputSink.startWriterThreads(entryBuffers);
+    outputSink.startWriterThreads();

    try {
      int i = 0;
@ -335,8 +351,7 @@ public class HLogSplitter {
   * out by region and stored.
   * <p>
   * If the log file has N regions then N recovered.edits files will be
-   * produced. There is no buffering in this code. Instead it relies on the
-   * buffering in the SequenceFileWriter.
+   * produced.
   * <p>
   * @param rootDir
   * @param tmpname
@ -357,27 +372,14 @@ public class HLogSplitter {
  public boolean splitLogFileToTemp(FileStatus logfile, String tmpname,
      CancelableProgressable reporter)
  throws IOException {
-    final Map<byte[], Object> logWriters = Collections.
-    synchronizedMap(new TreeMap<byte[], Object>(Bytes.BYTES_COMPARATOR));
    boolean isCorrupted = false;
    Preconditions.checkState(status == null);
    status = TaskMonitor.get().createStatus(
        "Splitting log file " + logfile.getPath() +
        "into a temporary staging area.");
-
-    Object BAD_WRITER = new Object();
-
-    boolean progress_failed = false;
-
    boolean skipErrors = conf.getBoolean("hbase.hlog.split.skip.errors",
        HLog.SPLIT_SKIP_ERRORS_DEFAULT);
    int interval = conf.getInt("hbase.splitlog.report.interval.loglines", 1024);
-    // How often to send a progress report (default 1/2 the zookeeper session
-    // timeout of if that not set, the split log DEFAULT_TIMEOUT)
-    int period = conf.getInt("hbase.splitlog.report.period",
-      conf.getInt("hbase.splitlog.manager.timeout", SplitLogManager.DEFAULT_TIMEOUT) / 2);
-    int numOpenedFilesBeforeReporting =
-      conf.getInt("hbase.splitlog.report.openedfiles", 3);
    Path logPath = logfile.getPath();
    long logLength = logfile.getLen();
    LOG.info("Splitting hlog: " + logPath + ", length=" + logLength);
@ -395,59 +397,38 @@ public class HLogSplitter {
      LOG.warn("Nothing to split in log file " + logPath);
      return true;
    }
-    long t = EnvironmentEdgeManager.currentTimeMillis();
-    long last_report_at = t;
-    if (reporter != null && reporter.progress() == false) {
-      status.markComplete("Failed: reporter.progress asked us to terminate");
+    this.setDistributedLogSplittingHelper(new DistributedLogSplittingHelper(
+        reporter, tmpname));
+    if (!reportProgressIfIsDistributedLogSplitting()) {
      return false;
    }
+    boolean progress_failed = false;
+    int numOpenedFilesBeforeReporting = conf.getInt("hbase.splitlog.report.openedfiles", 3);
+    int numOpenedFilesLastCheck = 0;
+    outputSink.startWriterThreads();
    // Report progress every so many edits and/or files opened (opening a file
    // takes a bit of time).
    int editsCount = 0;
-    int numNewlyOpenedFiles = 0;
    Entry entry;
    try {
      while ((entry = getNextLogLine(in,logPath, skipErrors)) != null) {
-        byte[] region = entry.getKey().getEncodedRegionName();
-        Object o = logWriters.get(region);
-        if (o == BAD_WRITER) {
-          continue;
-        }
-        WriterAndPath wap = (WriterAndPath)o;
-        if (wap == null) {
-          wap = createWAP(region, entry, rootDir, tmpname, fs, conf);
-          numNewlyOpenedFiles++;
-          if (wap == null) {
-            // ignore edits from this region. It doesn't exist anymore.
-            // It was probably already split.
-            logWriters.put(region, BAD_WRITER);
-            continue;
-          } else {
-            logWriters.put(region, wap);
-          }
-        }
-        wap.w.append(entry);
-        outputSink.updateRegionMaximumEditLogSeqNum(entry);
+        entryBuffers.appendEntry(entry);
        editsCount++;
-        // If sufficient edits have passed OR we've opened a few files, check if
-        // we should report progress.
-        if (editsCount % interval == 0 ||
-            (numNewlyOpenedFiles > numOpenedFilesBeforeReporting)) {
-          // Zero out files counter each time we fall in here.
-          numNewlyOpenedFiles = 0;
-          String countsStr = "edits=" + editsCount + ", files=" + logWriters.size();
+        // If sufficient edits have passed, check if we should report progress.
+        if (editsCount % interval == 0
+            || (outputSink.logWriters.size() - numOpenedFilesLastCheck) > numOpenedFilesBeforeReporting) {
+          numOpenedFilesLastCheck = outputSink.logWriters.size();
+          String countsStr = "edits=" + editsCount;
          status.setStatus("Split " + countsStr);
-          long t1 = EnvironmentEdgeManager.currentTimeMillis();
-          if ((t1 - last_report_at) > period) {
-            last_report_at = t;
-            if (reporter != null && reporter.progress() == false) {
-              status.markComplete("Failed: reporter.progress asked us to terminate; " + countsStr);
-              progress_failed = true;
+          if (!reportProgressIfIsDistributedLogSplitting()) {
            return false;
          }
        }
      }
-      }
+    } catch (InterruptedException ie) {
+      IOException iie = new InterruptedIOException();
+      iie.initCause(ie);
+      throw iie;
    } catch (CorruptedLogFileException e) {
      LOG.warn("Could not parse, corrupted log file " + logPath, e);
      ZKSplitLog.markCorrupted(rootDir, tmpname, fs);
@ -456,79 +437,15 @@ public class HLogSplitter {
      e = RemoteExceptionHandler.checkIOException(e);
      throw e;
    } finally {
-      boolean allWritersClosed = false;
-      try {
-        int n = 0;
-        for (Map.Entry<byte[], Object> logWritersEntry : logWriters.entrySet()) {
-          Object o = logWritersEntry.getValue();
-          long t1 = EnvironmentEdgeManager.currentTimeMillis();
-          if ((t1 - last_report_at) > period) {
-            last_report_at = t;
-            if ((progress_failed == false) && (reporter != null)
-                && (reporter.progress() == false)) {
-              progress_failed = true;
-            }
-          }
-          if (o == BAD_WRITER) {
-            continue;
-          }
-          n++;
-          WriterAndPath wap = (WriterAndPath) o;
-          try {
-            wap.writerClosed = true;
-            wap.w.close();
-            LOG.debug("Closed " + wap.p);
-          } catch (IOException e) {
-            LOG.debug("Exception while closing the writer :", e);
-          }
-          Path dst = getCompletedRecoveredEditsFilePath(wap.p, outputSink
-              .getRegionMaximumEditLogSeqNum(logWritersEntry.getKey()));
-          if (!dst.equals(wap.p) && fs.exists(dst)) {
-            LOG.warn("Found existing old edits file. It could be the "
-                + "result of a previous failed split attempt. Deleting " + dst
-                + ", length=" + fs.getFileStatus(dst).getLen());
-            if (!fs.delete(dst, false)) {
-              LOG.warn("Failed deleting of old " + dst);
-              throw new IOException("Failed deleting of old " + dst);
-            }
-          }
-          // Skip the unit tests which create a splitter that reads and writes
-          // the
-          // data without touching disk. TestHLogSplit#testThreading is an
-          // example.
-          if (fs.exists(wap.p)) {
-            if (!fs.rename(wap.p, dst)) {
-              throw new IOException("Failed renaming " + wap.p + " to " + dst);
-            }
-            LOG.debug("Rename " + wap.p + " to " + dst);
-          }
-        }
-        allWritersClosed = true;
-        String msg = "Processed " + editsCount + " edits across " + n
-            + " regions" + " threw away edits for " + (logWriters.size() - n)
-            + " regions" + "; log file=" + logPath + " is corrupted = "
-            + isCorrupted + " progress failed = " + progress_failed;
+      LOG.info("Finishing writing output logs and closing down.");
+      progress_failed = outputSink.finishWritingAndClose() == null;
+      String msg = "Processed " + editsCount + " edits across "
+          + outputSink.getOutputCounts().size() + " regions; log file="
+          + logPath + " is corrupted = " + isCorrupted + " progress failed = "
+          + progress_failed;
+      ;
      LOG.info(msg);
      status.markComplete(msg);
-      } finally {
-        if (!allWritersClosed) {
-          for (Map.Entry<byte[], Object> logWritersEntry : logWriters.entrySet()) {
-            Object o = logWritersEntry.getValue();
-            if (o != BAD_WRITER) {
-              WriterAndPath wap = (WriterAndPath) o;
-              try {
-                if (!wap.writerClosed) {
-                  wap.writerClosed = true;
-                  wap.w.close();
-                }
-              } catch (IOException e) {
-                LOG.debug("Exception while closing the writer :", e);
-              }
-            }
-          }
-        }
-        in.close();
-      }
    }
    return !progress_failed;
  }
@ -1171,6 +1088,63 @@ public class HLogSplitter {
    return ret;
  }

+  /***
+   * @return false if it is a distributed log splitting and it failed to report
+   *         progress
+   */
+  private boolean reportProgressIfIsDistributedLogSplitting() {
+    if (this.distributedLogSplittingHelper != null) {
+      return distributedLogSplittingHelper.reportProgress();
+    } else {
+      return true;
+    }
+  }
+
+  /**
+   * A class used in distributed log splitting
+   * 
+   */
+  class DistributedLogSplittingHelper {
+    // Report progress, only used in distributed log splitting
+    private final CancelableProgressable splitReporter;
+    // How often to send a progress report (default 1/2 master timeout)
+    private final int report_period;
+    private long last_report_at = 0;
+    private final String tmpDirName;
+
+    public DistributedLogSplittingHelper(CancelableProgressable reporter,
+        String tmpName) {
+      this.splitReporter = reporter;
+      this.tmpDirName = tmpName;
+      report_period = conf.getInt("hbase.splitlog.report.period",
+          conf.getInt("hbase.splitlog.manager.timeout",
+              SplitLogManager.DEFAULT_TIMEOUT) / 2);
+    }
+
+    /***
+     * @return false if reporter failed progressing
+     */
+    private boolean reportProgress() {
+      if (splitReporter == null) {
+        return true;
+      } else {
+        long t = EnvironmentEdgeManager.currentTimeMillis();
+        if ((t - last_report_at) > report_period) {
+          last_report_at = t;
+          if (this.splitReporter.progress() == false) {
+            LOG.warn("Failed: reporter.progress asked us to terminate");
+            return false;
+          }
+        }
+        return true;
+      }
+    }
+
+    String getTmpDirName() {
+      return this.tmpDirName;
+    }
+  }
+
  /**
   * Class that manages the output streams from the log splitting process.
   */
@ -1189,20 +1163,23 @@ public class HLogSplitter {
    
    private boolean logWritersClosed  = false;

-    /**
-     * Start the threads that will pump data from the entryBuffers
-     * to the output files.
-     * @return the list of started threads
-     */
-    synchronized void startWriterThreads(EntryBuffers entryBuffers) {
+    private final int numThreads;
+
+    public OutputSink() {
      // More threads could potentially write faster at the expense
      // of causing more disk seeks as the logs are split.
      // 3. After a certain setting (probably around 3) the
      // process will be bound on the reader in the current
      // implementation anyway.
-      int numThreads = conf.getInt(
+      numThreads = conf.getInt(
          "hbase.regionserver.hlog.splitlog.writer.threads", 3);
+    }

+    /**
+     * Start the threads that will pump data from the entryBuffers
+     * to the output files.
+     */
+    synchronized void startWriterThreads() {
      for (int i = 0; i < numThreads; i++) {
        WriterThread t = new WriterThread(i);
        t.start();
@ -1210,22 +1187,35 @@ public class HLogSplitter {
      }
    }

+    /**
+     * 
+     * @return null if failed to report progress
+     * @throws IOException
+     */
    List<Path> finishWritingAndClose() throws IOException {
      LOG.info("Waiting for split writer threads to finish");
+      boolean progress_failed = false;
      try {
        for (WriterThread t : writerThreads) {
          t.finish();
        }
        for (WriterThread t : writerThreads) {
+          if (!progress_failed && !reportProgressIfIsDistributedLogSplitting()) {
+            progress_failed = true;
+          }
          try {
            t.join();
          } catch (InterruptedException ie) {
-            throw new IOException(ie);
+            IOException iie = new InterruptedIOException();
+            iie.initCause(ie);
+            throw iie;
          }
          checkForErrors();
        }
        LOG.info("Split writers finished");
-
+        if (progress_failed) {
+          return null;
+        }
        return closeStreams();
      } finally {
        List<IOException> thrown = closeLogWriters(null);
@ -1242,45 +1232,92 @@ public class HLogSplitter {
    private List<Path> closeStreams() throws IOException {
      Preconditions.checkState(!closeAndCleanCompleted);

-      List<Path> paths = new ArrayList<Path>();
-      List<IOException> thrown = Lists.newArrayList();
-      closeLogWriters(thrown);
-      for (Map.Entry<byte[], WriterAndPath> logWritersEntry : logWriters
+      final List<Path> paths = new ArrayList<Path>();
+      final List<IOException> thrown = Lists.newArrayList();
+      ThreadPoolExecutor closeThreadPool = Threads.getBoundedCachedThreadPool(
+          numThreads, 30L, TimeUnit.SECONDS, new ThreadFactory() {
+            private int count = 1;
+            public Thread newThread(Runnable r) {
+              Thread t = new Thread(r, "split-log-closeStream-" + count++);
+              return t;
+            }
+          });
+      CompletionService<Void> completionService = new ExecutorCompletionService<Void>(
+          closeThreadPool);
+      for (final Map.Entry<byte[], WriterAndPath> logWritersEntry : logWriters
          .entrySet()) {
+        completionService.submit(new Callable<Void>() {
+          public Void call() throws Exception {
            WriterAndPath wap = logWritersEntry.getValue();
+            try {
+              wap.w.close();
+            } catch (IOException ioe) {
+              LOG.error("Couldn't close log at " + wap.p, ioe);
+              thrown.add(ioe);
+              return null;
+            }
+            LOG.info("Closed path " + wap.p + " (wrote " + wap.editsWritten
+                + " edits in " + (wap.nanosSpent / 1000 / 1000) + "ms)");
            Path dst = getCompletedRecoveredEditsFilePath(wap.p,
                regionMaximumEditLogSeqNum.get(logWritersEntry.getKey()));
            try {
              if (!dst.equals(wap.p) && fs.exists(dst)) {
                LOG.warn("Found existing old edits file. It could be the "
-                + "result of a previous failed split attempt. Deleting " + dst
-                + ", length=" + fs.getFileStatus(dst).getLen());
+                    + "result of a previous failed split attempt. Deleting "
+                    + dst + ", length=" + fs.getFileStatus(dst).getLen());
                if (!fs.delete(dst, false)) {
                  LOG.warn("Failed deleting of old " + dst);
                  throw new IOException("Failed deleting of old " + dst);
                }
              }
-          // Skip the unit tests which create a splitter that reads and writes
-          // the data without touching disk. TestHLogSplit#testThreading is an
-          // example.
+              // Skip the unit tests which create a splitter that reads and
+              // writes the data without touching disk.
+              // TestHLogSplit#testThreading is an example.
              if (fs.exists(wap.p)) {
                if (!fs.rename(wap.p, dst)) {
-              throw new IOException("Failed renaming " + wap.p + " to " + dst);
+                  throw new IOException("Failed renaming " + wap.p + " to "
+                      + dst);
                }
                LOG.debug("Rename " + wap.p + " to " + dst);
              }
            } catch (IOException ioe) {
              LOG.error("Couldn't rename " + wap.p + " to " + dst, ioe);
              thrown.add(ioe);
-          continue;
+              return null;
            }
            paths.add(dst);
+            return null;
          }
+        });
+      }
+
+      boolean progress_failed = false;
+      try {
+        for (int i = 0; i < logWriters.size(); i++) {
+          Future<Void> future = completionService.take();
+          future.get();
+          if (!progress_failed && !reportProgressIfIsDistributedLogSplitting()) {
+            progress_failed = true;
+          }
+        }
+      } catch (InterruptedException e) {
+        IOException iie = new InterruptedIOException();
+        iie.initCause(e);
+        throw iie;
+      } catch (ExecutionException e) {
+        throw new IOException(e.getCause());
+      } finally {
+        closeThreadPool.shutdownNow();
+      }
+
      if (!thrown.isEmpty()) {
        throw MultipleIOException.createIOException(thrown);
      }
-
+      logWritersClosed = true;
      closeAndCleanCompleted = true;
+      if (progress_failed) {
+        return null;
+      }
      return paths;
    }
    
@ -1325,7 +1362,9 @@ public class HLogSplitter {
      if (blacklistedRegions.contains(region)) {
        return null;
      }
-      ret = createWAP(region, entry, rootDir, null, fs, conf);
+      String tmpName = distributedLogSplittingHelper == null ? null
+          : distributedLogSplittingHelper.getTmpDirName();
+      ret = createWAP(region, entry, rootDir, tmpName, fs, conf);
      if (ret == null) {
        blacklistedRegions.add(region);
        return null;
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestDistributedLogSplitting.java
@ -397,6 +397,9 @@ public class TestDistributedLogSplitting {
      List<HRegionInfo> hris, String tname,
      int num_edits, int edit_size) throws IOException {

+    // remove root and meta region
+    hris.remove(HRegionInfo.ROOT_REGIONINFO);
+    hris.remove(HRegionInfo.FIRST_META_REGIONINFO);
    byte[] table = Bytes.toBytes(tname);
    HTableDescriptor htd = new HTableDescriptor(tname);
    byte[] value = new byte[edit_size];