From 022ef75950cb9b4e9f372ce309ae312c36526eef Mon Sep 17 00:00:00 2001 From: Zhihong Yu Date: Thu, 17 May 2012 15:55:59 +0000 Subject: [PATCH] HBASE-5826 Improve sync of HLog edits (Todd) git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1339671 13f79535-47bb-0310-9956-ffa450edef68 --- .../hadoop/hbase/regionserver/wal/HLog.java | 181 +++++++++++------- .../wal/SequenceFileLogWriter.java | 2 +- .../hbase/regionserver/wal/TestHLog.java | 10 +- 3 files changed, 114 insertions(+), 79 deletions(-) diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java b/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java index 65cb20975a1..147867faeaa 100644 --- a/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java +++ b/src/main/java/org/apache/hadoop/hbase/regionserver/wal/HLog.java @@ -48,6 +48,7 @@ import java.util.concurrent.locks.ReentrantLock; import java.util.regex.Matcher; import java.util.regex.Pattern; +import com.google.common.base.Charsets; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.classification.InterfaceAudience; @@ -139,8 +140,10 @@ public class HLog implements Syncable { private final long optionalFlushInterval; private final long blocksize; private final String prefix; - private final AtomicLong unflushedEntries = new AtomicLong(0); - private volatile long syncedTillHere = 0; + + /** tracking information about what has been synced */ + private SyncInfo syncInfo = new SyncInfo(); + private long lastDeferredTxid; private final Path oldLogDir; private volatile boolean logRollRunning; @@ -230,7 +233,6 @@ public class HLog implements Syncable { // during an update // locked during appends private final Object updateLock = new Object(); - private final Object flushLock = new Object(); private final boolean enabled; @@ -298,6 +300,7 @@ public class HLog implements Syncable { private static Metric writeSize = new Metric(); // For measuring latency of syncs private static Metric syncTime = new Metric(); + private static AtomicLong syncBatchSize = new AtomicLong(); //For measuring slow HLog appends private static AtomicLong slowHLogAppendCount = new AtomicLong(); private static Metric slowHLogAppendTime = new Metric(); @@ -314,6 +317,10 @@ public class HLog implements Syncable { return syncTime.get(); } + public static long getSyncBatchSize() { + return syncBatchSize.getAndSet(0); + } + public static long getSlowAppendCount() { return slowHLogAppendCount.get(); } @@ -833,17 +840,11 @@ public class HLog implements Syncable { try { // Wait till all current transactions are written to the hlog. // No new transactions can occur because we have the updatelock. - if (this.unflushedEntries.get() != this.syncedTillHere) { - LOG.debug("cleanupCurrentWriter " + - " waiting for transactions to get synced " + - " total " + this.unflushedEntries.get() + - " synced till here " + syncedTillHere); - sync(); - } + sync(); this.writer.close(); this.writer = null; closeErrorCount.set(0); - } catch (IOException e) { + } catch (Exception e) { LOG.error("Failed close of HLog writer", e); int errors = closeErrorCount.incrementAndGet(); if (errors <= closeErrorsTolerated && !hasDeferredEntries()) { @@ -1006,7 +1007,7 @@ public class HLog implements Syncable { * @param logEdit * @param logKey * @param doSync shall we sync after writing the transaction - * @return The txid of this transaction + * @return The seqnum of this transaction * @throws IOException */ public long append(HRegionInfo regionInfo, HLogKey logKey, WALEdit logEdit, @@ -1015,9 +1016,9 @@ public class HLog implements Syncable { if (this.closed) { throw new IOException("Cannot append; log is closed"); } - long txid = 0; + long seqNum; synchronized (updateLock) { - long seqNum = obtainSeqNum(); + seqNum = obtainSeqNum(); logKey.setLogSeqNum(seqNum); // The 'lastSeqWritten' map holds the sequence number of the oldest // write for each region (i.e. the first edit added to the particular @@ -1027,10 +1028,9 @@ public class HLog implements Syncable { this.lastSeqWritten.putIfAbsent(regionInfo.getEncodedNameAsBytes(), Long.valueOf(seqNum)); doWrite(regionInfo, logKey, logEdit, htd); - txid = this.unflushedEntries.incrementAndGet(); this.numEntries.incrementAndGet(); if (htd.isDeferredLogFlush()) { - lastDeferredTxid = txid; + lastDeferredTxid = seqNum; } } @@ -1040,9 +1040,9 @@ public class HLog implements Syncable { (regionInfo.isMetaRegion() || !htd.isDeferredLogFlush())) { // sync txn to file system - this.sync(txid); + this.sync(seqNum); } - return txid; + return seqNum; } /** @@ -1090,13 +1090,13 @@ public class HLog implements Syncable { private long append(HRegionInfo info, byte [] tableName, WALEdit edits, UUID clusterId, final long now, HTableDescriptor htd, boolean doSync) throws IOException { - if (edits.isEmpty()) return this.unflushedEntries.get();; + if (edits.isEmpty()) return this.logSeqNum.get(); if (this.closed) { throw new IOException("Cannot append; log is closed"); } - long txid = 0; + long seqNum; synchronized (this.updateLock) { - long seqNum = obtainSeqNum(); + seqNum = obtainSeqNum(); // The 'lastSeqWritten' map holds the sequence number of the oldest // write for each region (i.e. the first edit added to the particular // memstore). . When the cache is flushed, the entry for the @@ -1109,9 +1109,8 @@ public class HLog implements Syncable { HLogKey logKey = makeKey(encodedRegionName, tableName, seqNum, now, clusterId); doWrite(info, logKey, edits, htd); this.numEntries.incrementAndGet(); - txid = this.unflushedEntries.incrementAndGet(); if (htd.isDeferredLogFlush()) { - lastDeferredTxid = txid; + lastDeferredTxid = seqNum; } } // Sync if catalog region, and if not then check if that table supports @@ -1120,9 +1119,9 @@ public class HLog implements Syncable { (info.isMetaRegion() || !htd.isDeferredLogFlush())) { // sync txn to file system - this.sync(txid); + this.sync(seqNum); } - return txid; + return seqNum; } /** @@ -1180,6 +1179,9 @@ public class HLog implements Syncable { // goal is to increase the batchsize for writing-to-hdfs as well as // sync-to-hdfs, so that we can get better system throughput. private List pendingWrites = new LinkedList(); + long lastSeqAppended = -1; + long lastSeqFlushed = -1; + private Object flushLock = new Object(); LogSyncer(long optionalFlushInterval) { this.optionalFlushInterval = optionalFlushInterval; @@ -1193,7 +1195,7 @@ public class HLog implements Syncable { while(!this.isInterrupted() && !closeLogSyncer) { try { - if (unflushedEntries.get() <= syncedTillHere) { + if (syncInfo.getLastSyncedTxId() >= logSeqNum.get()) { Thread.sleep(this.optionalFlushInterval); } sync(); @@ -1213,24 +1215,45 @@ public class HLog implements Syncable { // our own queue rather than writing it to the HDFS output stream because // HDFSOutputStream.writeChunk is not lightweight at all. synchronized void append(Entry e) throws IOException { + long seq = e.getKey().getLogSeqNum(); + assert seq > lastSeqAppended; + lastSeqAppended = seq; pendingWrites.add(e); } // Returns all currently pending writes. New writes // will accumulate in a new list. - synchronized List getPendingWrites() { - List save = this.pendingWrites; - this.pendingWrites = new LinkedList(); - return save; - } + long flushWritesTo(Writer writer) throws IOException { + synchronized (flushLock) { + List pending; - // writes out pending entries to the HLog - void hlogFlush(Writer writer, List pending) throws IOException { - if (pending == null) return; + synchronized (this) { + pending = pendingWrites; + pendingWrites = new LinkedList(); + } - // write out all accumulated Entries to hdfs. - for (Entry e : pending) { - writer.append(e); + boolean success = false; + try { + int numFlushed = 0; + for (Entry e : pending) { + writer.append(e); + long seq = e.getKey().getLogSeqNum(); + assert seq > lastSeqFlushed; + lastSeqFlushed = seq; + numFlushed++; + } + syncBatchSize.addAndGet(numFlushed); + success = true; + } finally { + if (!success) { + // push back our batch into the pending list + synchronized (this) { + pending.addAll(pendingWrites); + pendingWrites = pending; + } + } + } + return lastSeqFlushed; } } @@ -1239,9 +1262,31 @@ public class HLog implements Syncable { } } + private static class SyncInfo { + private long syncedTillHere = 0; + + synchronized long getLastSyncedTxId() { + return syncedTillHere; + } + + synchronized void notifySynced(long txid) { + if (txid > syncedTillHere) { + syncedTillHere = txid; + } + notifyAll(); + } + + synchronized void waitForSync(long txid) throws InterruptedException { + while (syncedTillHere < txid) { + wait(); + } + } + } + + // sync all known transactions private void syncer() throws IOException { - syncer(this.unflushedEntries.get()); // sync all pending items + syncer(logSeqNum.get()); // sync all pending } // sync all transactions upto the specified txid @@ -1253,47 +1298,26 @@ public class HLog implements Syncable { } // if the transaction that we are interested in is already // synced, then return immediately. - if (txid <= this.syncedTillHere) { + if (syncInfo.getLastSyncedTxId() >= txid) { return; } try { - long doneUpto; long now = System.currentTimeMillis(); - // First flush all the pending writes to HDFS. Then - // issue the sync to HDFS. If sync is successful, then update - // syncedTillHere to indicate that transactions till this - // number has been successfully synced. - synchronized (flushLock) { - if (txid <= this.syncedTillHere) { - return; - } - doneUpto = this.unflushedEntries.get(); - List pending = logSyncerThread.getPendingWrites(); - try { - logSyncerThread.hlogFlush(tempWriter, pending); - } catch(IOException io) { - synchronized (this.updateLock) { - // HBASE-4387, HBASE-5623, retry with updateLock held - tempWriter = this.writer; - logSyncerThread.hlogFlush(tempWriter, pending); - } - } - } - // another thread might have sync'ed avoid double-sync'ing - if (txid <= this.syncedTillHere) { - return; - } + long flushedSeqId; try { + flushedSeqId = logSyncerThread.flushWritesTo(tempWriter); tempWriter.sync(); } catch(IOException io) { synchronized (this.updateLock) { // HBASE-4387, HBASE-5623, retry with updateLock held tempWriter = this.writer; + flushedSeqId = logSyncerThread.flushWritesTo(tempWriter); tempWriter.sync(); } } - this.syncedTillHere = Math.max(this.syncedTillHere, doneUpto); - + syncInfo.notifySynced(flushedSeqId); + // We try to not acquire the updateLock just to update statistics. + // Make these statistics as AtomicLong. syncTime.inc(System.currentTimeMillis() - now); if (!this.logRollRunning) { checkLowReplication(); @@ -1548,14 +1572,15 @@ public class HLog implements Syncable { if (this.closed) { return; } - long txid = 0; + long seqNumOfCompletionEdit; synchronized (updateLock) { + seqNumOfCompletionEdit = obtainSeqNum(); long now = System.currentTimeMillis(); - WALEdit edit = completeCacheFlushLogEdit(); - HLogKey key = makeKey(encodedRegionName, tableName, logSeqId, + + WALEdit edit = completeCacheFlushLogEdit(logSeqId); + HLogKey key = makeKey(encodedRegionName, tableName, seqNumOfCompletionEdit, System.currentTimeMillis(), HConstants.DEFAULT_CLUSTER_ID); logSyncerThread.append(new Entry(key, edit)); - txid = this.unflushedEntries.incrementAndGet(); writeTime.inc(System.currentTimeMillis() - now); long len = 0; for (KeyValue kv : edit.getKeyValues()) { @@ -1565,7 +1590,7 @@ public class HLog implements Syncable { this.numEntries.incrementAndGet(); } // sync txn to file system - this.sync(txid); + this.sync(seqNumOfCompletionEdit); } finally { // updateLock not needed for removing snapshot's entry @@ -1576,9 +1601,17 @@ public class HLog implements Syncable { } } - private WALEdit completeCacheFlushLogEdit() { + private WALEdit completeCacheFlushLogEdit(long seqIdOfFlush) { + // The data is not actually used here - we just need to write + // something to the log to make sure we're still the owner of the + // pipeline. + byte[] data = Bytes.add( + COMPLETE_CACHE_FLUSH, + ":".getBytes(Charsets.UTF_8), + Bytes.toBytes(String.valueOf(seqIdOfFlush))); + KeyValue kv = new KeyValue(METAROW, METAFAMILY, null, - System.currentTimeMillis(), COMPLETE_CACHE_FLUSH); + System.currentTimeMillis(), data); WALEdit e = new WALEdit(); e.add(kv); return e; @@ -1855,7 +1888,7 @@ public class HLog implements Syncable { /** Provide access to currently deferred sequence num for tests */ boolean hasDeferredEntries() { - return lastDeferredTxid > syncedTillHere; + return lastDeferredTxid > syncInfo.getLastSyncedTxId(); } /** diff --git a/src/main/java/org/apache/hadoop/hbase/regionserver/wal/SequenceFileLogWriter.java b/src/main/java/org/apache/hadoop/hbase/regionserver/wal/SequenceFileLogWriter.java index 5e810c0df80..8abb7f1a3fd 100644 --- a/src/main/java/org/apache/hadoop/hbase/regionserver/wal/SequenceFileLogWriter.java +++ b/src/main/java/org/apache/hadoop/hbase/regionserver/wal/SequenceFileLogWriter.java @@ -245,7 +245,7 @@ public class SequenceFileLogWriter implements HLog.Writer { @Override public void sync() throws IOException { - this.writer.syncFs(); + if (this.writer != null) this.writer.syncFs(); } @Override diff --git a/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java b/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java index 7c3cb70e094..93ded443f49 100644 --- a/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java +++ b/src/test/java/org/apache/hadoop/hbase/regionserver/wal/TestHLog.java @@ -548,8 +548,9 @@ public class TestHLog { KeyValue kv = val.getKeyValues().get(0); assertTrue(Bytes.equals(HLog.METAROW, kv.getRow())); assertTrue(Bytes.equals(HLog.METAFAMILY, kv.getFamily())); - assertEquals(0, Bytes.compareTo(HLog.COMPLETE_CACHE_FLUSH, - val.getKeyValues().get(0).getValue())); + assertTrue(Bytes.startsWith( + val.getKeyValues().get(0).getValue(), + HLog.COMPLETE_CACHE_FLUSH)); System.out.println(key + " " + val); } } finally { @@ -616,8 +617,9 @@ public class TestHLog { assertTrue(Bytes.equals(tableName, entry.getKey().getTablename())); assertTrue(Bytes.equals(HLog.METAROW, val.getRow())); assertTrue(Bytes.equals(HLog.METAFAMILY, val.getFamily())); - assertEquals(0, Bytes.compareTo(HLog.COMPLETE_CACHE_FLUSH, - val.getValue())); + assertTrue(Bytes.startsWith( + val.getValue(), + HLog.COMPLETE_CACHE_FLUSH)); System.out.println(entry.getKey() + " " + val); } } finally {