HDFS-14557. JournalNode error: Can't scan a pre-transactional edit log. Contributed by Stephen O'Donnell.

Signed-off-by: Wei-Chiu Chuang <weichiu@apache.org>
This commit is contained in:
Stephen O'Donnell 2019-08-06 13:52:16 -07:00 committed by Wei-Chiu Chuang
parent 69e3ac5cbb
commit 99af4ce0ac
4 changed files with 81 additions and 0 deletions

View File

@ -163,6 +163,16 @@ public class EditLogFileInputStream extends EditLogInputStream {
} catch (EOFException eofe) { } catch (EOFException eofe) {
throw new LogHeaderCorruptException("No header found in log"); throw new LogHeaderCorruptException("No header found in log");
} }
if (logVersion == -1) {
// The edits in progress file is pre-allocated with 1MB of "-1" bytes
// when it is created, then the header is written. If the header is
// -1, it indicates that an exception occurred pre-allocating the file
// and the header was never written. Therefore this is effectively a
// corrupt and empty log.
throw new LogHeaderCorruptException("No header present in log (value " +
"is -1), probably due to disk space issues when it was created. " +
"The log has no transactions and will be sidelined.");
}
// We assume future layout will also support ADD_LAYOUT_FLAGS // We assume future layout will also support ADD_LAYOUT_FLAGS
if (NameNodeLayoutVersion.supports( if (NameNodeLayoutVersion.supports(
LayoutVersion.Feature.ADD_LAYOUT_FLAGS, logVersion) || LayoutVersion.Feature.ADD_LAYOUT_FLAGS, logVersion) ||

View File

@ -1284,6 +1284,12 @@ public class FSEditLogLoader {
+ lastPos, t); + lastPos, t);
in.resync(); in.resync();
FSImage.LOG.warn("After resync, position is " + in.getPosition()); FSImage.LOG.warn("After resync, position is " + in.getPosition());
if (in.getPosition() <= lastPos) {
FSImage.LOG.warn("After resync, the position, " +
in.getPosition() + " is not greater than the previous " +
"position " + lastPos + ". Skipping remainder of this log.");
break;
}
continue; continue;
} }
if (lastTxId == HdfsServerConstants.INVALID_TXID || txid > lastTxId) { if (lastTxId == HdfsServerConstants.INVALID_TXID || txid > lastTxId) {

View File

@ -28,6 +28,7 @@ import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream; import java.io.DataOutputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.RandomAccessFile;
import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil; import org.apache.hadoop.fs.FileUtil;
@ -125,6 +126,48 @@ public class TestJournal {
Assert.assertEquals(1, segmentState.getStartTxId()); Assert.assertEquals(1, segmentState.getStartTxId());
} }
/**
 * Test for HDFS-14557 to ensure that an edit file that failed to fully
 * allocate and has a header byte of -1 is moved aside to allow startup
 * to progress.
 *
 * The test finalizes one segment, leaves a second segment in progress,
 * overwrites that in-progress file with -1 bytes, and then re-creates the
 * Journal, expecting the overwritten file to be renamed with a ".empty"
 * suffix.
 *
 * @throws Exception if any journal or file operation fails
 */
@Test
public void testEmptyEditsInProgressMovedAside() throws Exception {
  final int numTxns = 5;

  // First, write numTxns transactions to the journal
  journal.startLogSegment(makeRI(1), 1,
      NameNodeLayoutVersion.CURRENT_LAYOUT_VERSION - 1);
  byte[] ops = QJMTestUtil.createTxnData(1, numTxns);
  journal.journal(makeRI(2), 1, 1, numTxns, ops);
  // Now close the segment
  journal.finalizeLogSegment(makeRI(3), 1, numTxns);

  // Create a new segment creating a new edits_inprogress file
  journal.startLogSegment(makeRI(4), 6,
      NameNodeLayoutVersion.CURRENT_LAYOUT_VERSION - 1);
  ops = QJMTestUtil.createTxnData(6, numTxns);
  journal.journal(makeRI(5), 6, 6, numTxns, ops);
  File eip = journal.getStorage().getInProgressEditLog(6);

  // Now stop the journal without finalizing the segment
  journal.close();

  // Now "zero out" the EIP file with -1 bytes, similar to how it would
  // appear if the pre-allocation failed. try-with-resources guarantees the
  // file handle is released even if a write fails part-way through.
  try (RandomAccessFile rwf = new RandomAccessFile(eip, "rw")) {
    // The length is fixed while we overwrite in place, so read it once.
    final long length = rwf.length();
    for (long pos = 0; pos < length; pos++) {
      rwf.write(-1);
    }
  }

  // Finally start the Journal again, and ensure the "zeroed out" file
  // is renamed with a .empty extension
  journal = new Journal(conf, TEST_LOG_DIR, JID, StartupOption.REGULAR,
      mockErrorReporter);
  File movedTo = new File(eip.getAbsolutePath() + ".empty");
  assertTrue(movedTo.exists());
}
@Test (timeout = 10000) @Test (timeout = 10000)
public void testEpochHandling() throws Exception { public void testEpochHandling() throws Exception {
assertEquals(0, journal.getLastPromisedEpoch()); assertEquals(0, journal.getLastPromisedEpoch());

View File

@ -28,6 +28,7 @@ import java.io.ByteArrayInputStream;
import java.io.File; import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.io.RandomAccessFile; import java.io.RandomAccessFile;
import java.io.FileOutputStream;
import java.net.HttpURLConnection; import java.net.HttpURLConnection;
import java.net.URL; import java.net.URL;
import java.util.EnumMap; import java.util.EnumMap;
@ -160,4 +161,25 @@ public class TestEditLogFileInputStream {
} }
elis.close(); elis.close();
} }
/**
 * Regression test for HDFS-14557 which verifies that scanning an edit log
 * filled with only "-1" bytes (as left behind when pre-allocation fails
 * before the header is written) completes and reports a corrupt header
 * with no valid end transaction id.
 *
 * @throws Exception if the temporary edit log cannot be written or scanned
 */
@Test(timeout=60000)
public void testScanEditThatFailedDuringPreAllocate() throws Exception {
  File editLog = new File(GenericTestUtils.getTempPath("testCorruptEditLog"));
  // try-with-resources closes the stream even if a write throws, so the
  // file is flushed and released before it is scanned.
  try (FileOutputStream os = new FileOutputStream(editLog)) {
    for (int i = 0; i < 1024; i++) {
      os.write(-1);
    }
  }
  FSEditLogLoader.EditLogValidation val =
      EditLogFileInputStream.scanEditLog(editLog, 1234, false);
  assertEquals(true, val.hasCorruptHeader());
  assertEquals(HdfsServerConstants.INVALID_TXID, val.getEndTxId());
}
} }