From b0a2dc9c8471337182162746d9298f08b39a5566 Mon Sep 17 00:00:00 2001 From: Kihwal Lee Date: Tue, 14 Jul 2015 14:07:38 -0500 Subject: [PATCH] HDFS-8722. Optimize datanode writes for small writes and flushes. Contributed by Kihwal Lee (cherry picked from commit 59388a801514d6af64ef27fbf246d8054f1dcc74) --- hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt | 2 ++ .../hdfs/server/datanode/BlockReceiver.java | 34 ++++++++++++------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt index 434f4872908..403ed062482 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt +++ b/hadoop-hdfs-project/hadoop-hdfs/CHANGES.txt @@ -717,6 +717,8 @@ Release 2.7.2 - UNRELEASED OPTIMIZATIONS + HDFS-8722. Optimize datanode writes for small writes and flushes (kihwal) + BUG FIXES Release 2.7.1 - 2015-07-06 diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java index 2468f432e28..55c9d572c82 100644 --- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java +++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/datanode/BlockReceiver.java @@ -598,14 +598,19 @@ class BlockReceiver implements Closeable { // bytes should be skipped when writing the data and checksum // buffers out to disk. 
long partialChunkSizeOnDisk = onDiskLen % bytesPerChecksum; + long lastChunkBoundary = onDiskLen - partialChunkSizeOnDisk; boolean alignedOnDisk = partialChunkSizeOnDisk == 0; boolean alignedInPacket = firstByteInBlock % bytesPerChecksum == 0; - // Since data is always appended, not overwritten, partial CRC - // recalculation is necessary if the on-disk data is not chunk- - // aligned, regardless of whether the beginning of the data in - // the packet is chunk-aligned. - boolean doPartialCrc = !alignedOnDisk && !shouldNotWriteChecksum; + // If the end of the on-disk data is not chunk-aligned, the last + // checksum needs to be overwritten. + boolean overwriteLastCrc = !alignedOnDisk && !shouldNotWriteChecksum; + // If the starting offset of the packet data is at the last chunk + // boundary of the data on disk, the partial checksum recalculation + // can be skipped and the checksum supplied by the client can be used + // instead. This reduces disk reads and cpu load. + boolean doCrcRecalc = overwriteLastCrc && + (lastChunkBoundary != firstByteInBlock); // If this is a partial chunk, then verify that this is the only // chunk in the packet. If the starting offset is not chunk @@ -621,9 +626,10 @@ class BlockReceiver implements Closeable { // If the last portion of the block file is not a full chunk, // then read in pre-existing partial data chunk and recalculate // the checksum so that the checksum calculation can continue - // from the right state. + // from the right state. If the client provided the checksum for + // the whole chunk, this is not necessary. Checksum partialCrc = null; - if (doPartialCrc) { + if (doCrcRecalc) { if (LOG.isDebugEnabled()) { LOG.debug("receivePacket for " + block + ": previous write did not end at the chunk boundary." @@ -659,8 +665,15 @@ class BlockReceiver implements Closeable { int skip = 0; byte[] crcBytes = null; - // First, overwrite the partial crc at the end, if necessary. 
- if (doPartialCrc) { // not chunk-aligned on disk + // First, prepare to overwrite the partial crc at the end. + if (overwriteLastCrc) { // not chunk-aligned on disk + // prepare to overwrite last checksum + adjustCrcFilePosition(); + } + + // The CRC was recalculated for the last partial chunk. Update the + // CRC by reading the rest of the chunk, then write it out. + if (doCrcRecalc) { // Calculate new crc for this chunk. int bytesToReadForRecalc = (int)(bytesPerChecksum - partialChunkSizeOnDisk); @@ -673,8 +686,6 @@ class BlockReceiver implements Closeable { byte[] buf = FSOutputSummer.convertToByteStream(partialCrc, checksumSize); crcBytes = copyLastChunkChecksum(buf, checksumSize, buf.length); - // prepare to overwrite last checksum - adjustCrcFilePosition(); checksumOut.write(buf); if(LOG.isDebugEnabled()) { LOG.debug("Writing out partial crc for data len " + len + @@ -687,7 +698,6 @@ class BlockReceiver implements Closeable { // boundary. The checksum after the boundary was already counted // above. Only count the number of checksums skipped up to the // boundary here. - long lastChunkBoundary = onDiskLen - (onDiskLen%bytesPerChecksum); long skippedDataBytes = lastChunkBoundary - firstByteInBlock; if (skippedDataBytes > 0) {