https://issues.apache.org/jira/browse/AMQ-5703 - further tests and fixes. Ensure early eof can be identified when checking for corruption and skip corruption on replay when the checksum is invalid

gtully 2015-04-16 14:58:45 +01:00
parent 7dc522d4c3
commit 73db4d2bfd
3 changed files with 351 additions and 7 deletions
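The core of the fix is easiest to see in isolation: a journal batch control record carries a size field, and when corruption mangles that field the claimed size can point past the end of the data file. The new guard in Journal (diffed below) only trusts the size when offset + BATCH_CONTROL_RECORD_SIZE + size still lands inside the file. Here is a minimal standalone sketch of that early-eof check, using a hypothetical [4-byte size][payload] record layout rather than the real KahaDB batch format:

import java.io.File;
import java.io.IOException;
import java.io.RandomAccessFile;

// Minimal sketch of the early-eof guard: validate a length-prefixed record
// against the file length before trusting it. The record layout here is
// hypothetical - just [4-byte size][size bytes of payload].
public class EofGuardSketch {

    static final int HEADER_SIZE = 4; // hypothetical: only the size field

    // returns the offset of the next record, or -1 when the size field cannot be trusted
    static long nextRecordOffset(RandomAccessFile file, long offset) throws IOException {
        if (offset + HEADER_SIZE > file.length()) {
            return -1; // not even a full header left - early eof
        }
        file.seek(offset);
        int size = file.readInt();
        // the guard from this commit: a corrupted size can claim data past end of file
        if (size < 0 || offset + HEADER_SIZE + size > file.length()) {
            return -1; // treat as corruption/eof rather than reading garbage
        }
        return offset + HEADER_SIZE + size;
    }

    public static void main(String[] args) throws IOException {
        File f = File.createTempFile("journal", ".dat");
        f.deleteOnExit();
        try (RandomAccessFile file = new RandomAccessFile(f, "rw")) {
            file.writeInt(8);             // record claims 8 bytes of payload
            file.write(new byte[8]);      // ...and has them
            file.writeInt(1024 * 1024);   // record claims 1MB but the file ends here
            System.out.println(nextRecordOffset(file, 0));  // 12
            System.out.println(nextRecordOffset(file, 12)); // -1, size points past eof
        }
    }
}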

org/apache/activemq/store/kahadb/MessageDatabase.java

@@ -603,10 +603,19 @@ public abstract class MessageDatabase extends ServiceSupport implements BrokerSe
         int redoCounter = 0;
         LOG.info("Recovering from the journal @" + recoveryPosition);
         while (recoveryPosition != null) {
+            try {
                 JournalCommand<?> message = load(recoveryPosition);
                 metadata.lastUpdate = recoveryPosition;
                 process(message, recoveryPosition, lastIndoubtPosition);
                 redoCounter++;
+            } catch (IOException failedRecovery) {
+                if (isIgnoreMissingJournalfiles()) {
+                    // track this dud location
+                    journal.corruptRecoveryLocation(recoveryPosition);
+                } else {
+                    throw failedRecovery;
+                }
+            }
             recoveryPosition = journal.getNextLocation(recoveryPosition);
             if (LOG.isInfoEnabled() && redoCounter % 100000 == 0) {
                 LOG.info("@" + recoveryPosition + ", " + redoCounter + " entries recovered ..");
@@ -826,8 +835,8 @@ public abstract class MessageDatabase extends ServiceSupport implements BrokerSe
             }
             if (!missingPredicates.isEmpty()) {
-                for (StoredDestination sd : storedDestinations.values()) {
+                for (Entry<String, StoredDestination> sdEntry : storedDestinations.entrySet()) {
+                    final StoredDestination sd = sdEntry.getValue();
                     final ArrayList<Long> matches = new ArrayList<Long>();
                     sd.locationIndex.visit(tx, new BTreeVisitor.OrVisitor<Location, Long>(missingPredicates) {
                         @Override
@@ -847,6 +856,7 @@ public abstract class MessageDatabase extends ServiceSupport implements BrokerSe
                             MessageKeys keys = sd.orderIndex.remove(tx, sequenceId);
                             sd.locationIndex.remove(tx, keys.location);
                             sd.messageIdIndex.remove(tx, keys.messageId);
+                            LOG.info("[" + sdEntry.getKey() + "] dropped: " + keys.messageId + " at corrupt location: " + keys.location);
                             undoCounter++;
                             // TODO: do we need to modify the ack positions for the pub sub case?
                         }
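One design choice worth noting in the replay loop above: the IOException is only swallowed, with the dud location handed to the journal, when ignoreMissingJournalfiles is enabled; otherwise it propagates and the broker refuses to start on a corrupt journal. A minimal configuration sketch for opting in to the tolerant path, using the same KahaDBPersistenceAdapter setters the new test below exercises:

import org.apache.activemq.broker.BrokerService;
import org.apache.activemq.store.kahadb.KahaDBPersistenceAdapter;

// Sketch: opt in to the tolerant recovery path exercised by this commit.
// Without ignoreMissingJournalfiles=true, the IOException from replay is
// rethrown and broker startup fails on a corrupt journal.
public class TolerantRecoveryConfig {
    public static void main(String[] args) throws Exception {
        BrokerService broker = new BrokerService();
        broker.setPersistent(true);
        KahaDBPersistenceAdapter adapter =
                (KahaDBPersistenceAdapter) broker.getPersistenceAdapter();
        adapter.setCheckForCorruptJournalFiles(true); // verify batch checksums on start
        adapter.setIgnoreMissingJournalfiles(true);   // skip corrupt sections instead of failing
        broker.start();
        broker.waitUntilStarted();
        broker.stop();
        broker.waitUntilStopped();
    }
}

The trade-off is availability over completeness: messages in the skipped range are dropped from the index, which is exactly what the new LOG.info above records.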

org/apache/activemq/store/kahadb/disk/journal/Journal.java

@@ -55,6 +55,28 @@ public class Journal {
     public static final int BATCH_CONTROL_RECORD_SIZE = RECORD_HEAD_SPACE+BATCH_CONTROL_RECORD_MAGIC.length+4+8;
     public static final byte[] BATCH_CONTROL_RECORD_HEADER = createBatchControlRecordHeader();
 
+    // tackle corruption when checksum is disabled or corrupt with zeros, minimise data loss
+    public void corruptRecoveryLocation(Location recoveryPosition) throws IOException {
+        DataFile dataFile = getDataFile(recoveryPosition);
+        // with corruption on recovery we have no faith in the content - slip to the next batch record or eof
+        DataFileAccessor reader = accessorPool.openDataFileAccessor(dataFile);
+        try {
+            int nextOffset = findNextBatchRecord(reader, recoveryPosition.getOffset() + 1);
+            Sequence sequence = new Sequence(recoveryPosition.getOffset(), nextOffset >= 0 ? nextOffset - 1 : dataFile.getLength() - 1);
+            LOG.info("Corrupt journal records found in '" + dataFile.getFile() + "' between offsets: " + sequence);
+            // skip corruption on getNextLocation
+            recoveryPosition.setOffset((int) sequence.getLast() + 1);
+            recoveryPosition.setSize(-1);
+            dataFile.corruptedBlocks.add(sequence);
+        } catch (IOException e) {
+            // best effort - the scan for the next batch marker may itself hit corrupt data
+        } finally {
+            accessorPool.closeDataFileAccessor(reader);
+        }
+    }
+
     public enum PreallocationStrategy {
         SPARSE_FILE,
         OS_KERNEL_COPY,
@@ -301,7 +323,7 @@ public class Journal {
         try {
             while( true ) {
                 int size = checkBatchRecord(reader, location.getOffset());
-                if ( size>=0 ) {
+                if ( size>=0 && location.getOffset()+BATCH_CONTROL_RECORD_SIZE+size <= dataFile.getLength()) {
                     location.setOffset(location.getOffset()+BATCH_CONTROL_RECORD_SIZE+size);
                 } else {
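corruptRecoveryLocation resynchronizes by asking findNextBatchRecord for the offset of the next batch-control magic and marking everything in between as a corrupt Sequence. The resync idea, sketched standalone with an illustrative marker (not the real BATCH_CONTROL_RECORD_HEADER bytes, and a simplified in-memory scan rather than the journal's file reader):

import java.util.Arrays;

// Sketch: resynchronize after corruption by scanning forward to the next magic
// marker - the same idea behind findNextBatchRecord and the test's findBatch
// helper. The marker bytes here are illustrative, not KahaDB's real header.
public class BatchResyncSketch {

    static final byte[] MAGIC = {'B', 'A', 'T', 'C', 'H'};

    // naive scan: first offset >= from where MAGIC occurs, or -1 at eof
    static int findNextBatchRecord(byte[] journal, int from) {
        for (int i = Math.max(from, 0); i + MAGIC.length <= journal.length; i++) {
            if (Arrays.equals(Arrays.copyOfRange(journal, i, i + MAGIC.length), MAGIC)) {
                return i;
            }
        }
        return -1;
    }

    public static void main(String[] args) {
        byte[] journal = "BATCH....garbage....BATCH....".getBytes();
        int corruptAt = 3; // pretend replay failed inside the first batch
        int next = findNextBatchRecord(journal, corruptAt + 1);
        // skip everything from the corrupt offset up to the next marker (or eof)
        int skipTo = next >= 0 ? next : journal.length;
        System.out.println("skipping corrupt range [" + corruptAt + ", " + (skipTo - 1) + "]");
    }
}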

org/apache/activemq/store/kahadb/JournalCorruptionEofIndexRecoveryTest.java

@@ -0,0 +1,312 @@
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.activemq.store.kahadb;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;

import javax.jms.Connection;
import javax.jms.Destination;
import javax.jms.Message;
import javax.jms.MessageConsumer;
import javax.jms.MessageProducer;
import javax.jms.Session;

import org.apache.activemq.ActiveMQConnectionFactory;
import org.apache.activemq.broker.BrokerService;
import org.apache.activemq.command.ActiveMQQueue;
import org.apache.activemq.store.kahadb.disk.journal.DataFile;
import org.apache.activemq.store.kahadb.disk.journal.Journal;
import org.apache.activemq.util.ByteSequence;
import org.apache.activemq.util.IOHelper;
import org.apache.activemq.util.RecoverableRandomAccessFile;
import org.junit.After;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class JournalCorruptionEofIndexRecoveryTest {

    private static final Logger LOG = LoggerFactory.getLogger(JournalCorruptionEofIndexRecoveryTest.class);

    ActiveMQConnectionFactory cf = null;
    BrokerService broker = null;
    private final Destination destination = new ActiveMQQueue("Test");
    private String connectionUri;
    private KahaDBPersistenceAdapter adapter;

    protected void startBroker() throws Exception {
        doStartBroker(true, false);
    }

    protected void restartBroker(boolean whackIndex) throws Exception {
        restartBroker(whackIndex, false);
    }

    protected void restartBroker(boolean whackIndex, boolean forceRecoverIndex) throws Exception {
        File dataDir = null;
        if (broker != null) {
            // resolve the data dir before stopping so the index file can still be located
            dataDir = broker.getPersistenceAdapter().getDirectory();
            broker.stop();
            broker.waitUntilStopped();
        }

        if (whackIndex) {
            File indexToDelete = new File(dataDir, "db.data");
            LOG.info("Whacking index: " + indexToDelete);
            indexToDelete.delete();
        }

        doStartBroker(false, forceRecoverIndex);
    }

    private void doStartBroker(boolean delete, boolean forceRecoverIndex) throws Exception {
        broker = new BrokerService();
        if (delete) {
            IOHelper.deleteChildren(broker.getPersistenceAdapter().getDirectory());
            IOHelper.delete(broker.getPersistenceAdapter().getDirectory());
        }
        broker.setPersistent(true);
        broker.setUseJmx(true);
        broker.addConnector("tcp://localhost:0");
        configurePersistence(broker, forceRecoverIndex);

        connectionUri = "vm://localhost?create=false";
        cf = new ActiveMQConnectionFactory(connectionUri);

        broker.start();
        LOG.info("Starting broker..");
    }

    protected void configurePersistence(BrokerService brokerService, boolean forceRecoverIndex) throws Exception {
        adapter = (KahaDBPersistenceAdapter) brokerService.getPersistenceAdapter();

        adapter.setForceRecoverIndex(forceRecoverIndex);

        // ensure there are a bunch of data files but multiple entries in each
        adapter.setJournalMaxFileLength(1024 * 20);

        // speed up the test case, checkpoint and cleanup early and often
        adapter.setCheckpointInterval(5000);
        adapter.setCleanupInterval(5000);

        adapter.setCheckForCorruptJournalFiles(true);
        adapter.setIgnoreMissingJournalfiles(true);
    }

    @After
    public void tearDown() throws Exception {
        if (broker != null) {
            broker.stop();
            broker.waitUntilStopped();
        }
    }

    @Test
    public void testRecoveryAfterCorruptionEof() throws Exception {
        startBroker();

        produceMessagesToConsumeMultipleDataFiles(50);

        int numFiles = getNumberOfJournalFiles();
        assertTrue("more than two data files: " + numFiles, numFiles > 2);

        corruptBatchEndEof(3);

        restartBroker(false);

        assertEquals("missing one message", 49, broker.getAdminView().getTotalMessageCount());
        assertEquals("Drain", 49, drainQueue(49));
    }

    @Test
    public void testRecoveryAfterCorruptionCheckSum() throws Exception {
        startBroker();

        produceMessagesToConsumeMultipleDataFiles(4);

        corruptBatchCheckSumSplash(1);

        restartBroker(true);

        assertEquals("missing one message", 3, broker.getAdminView().getTotalMessageCount());
        assertEquals("Drain", 3, drainQueue(4));
    }

    @Test
    public void testRecoveryAfterCorruptionCheckSumExistingIndex() throws Exception {
        startBroker();

        produceMessagesToConsumeMultipleDataFiles(4);

        corruptBatchCheckSumSplash(1);

        restartBroker(false);

        // with the existing index intact, the corruption goes unnoticed
        assertEquals("unnoticed", 4, broker.getAdminView().getTotalMessageCount());
        assertEquals("Drain", 0, drainQueue(4));

        // force recover index and lose one message
        restartBroker(false, true);

        assertEquals("missing one after index recreation", 3, broker.getAdminView().getTotalMessageCount());
        assertEquals("Drain", 3, drainQueue(4));
    }

    private void corruptBatchCheckSumSplash(int id) throws Exception {
        Collection<DataFile> files =
                ((KahaDBPersistenceAdapter) broker.getPersistenceAdapter()).getStore().getJournal().getFileMap().values();
        DataFile dataFile = (DataFile) files.toArray()[0];
        RecoverableRandomAccessFile randomAccessFile = dataFile.openRandomAccessFile();

        ArrayList<Integer> batchPositions = findBatch(randomAccessFile, Integer.MAX_VALUE);
        LOG.info("Batch positions: " + batchPositions);

        int pos = batchPositions.get(1);
        LOG.info("corrupting checksum and size (to push it past eof) of batch record at:" + id + "-" + pos);
        randomAccessFile.seek(pos + Journal.BATCH_CONTROL_RECORD_HEADER.length + 4);
        // whack the batch control record checksum
        randomAccessFile.writeLong(0L);

        // mod the data size in the location header so reading blows
        randomAccessFile.seek(pos + Journal.BATCH_CONTROL_RECORD_SIZE);
        int size = randomAccessFile.readInt();
        byte type = randomAccessFile.readByte();

        LOG.info("Read: size:" + size + ", type:" + type);

        randomAccessFile.seek(pos + Journal.BATCH_CONTROL_RECORD_SIZE);
        size -= 1;
        LOG.info("rewrite incorrect location size @:" + (pos + Journal.BATCH_CONTROL_RECORD_SIZE) + " as: " + size);
        randomAccessFile.writeInt(size);
        randomAccessFile.getChannel().force(true);
    }

    private void corruptBatchEndEof(int id) throws Exception {
        Collection<DataFile> files =
                ((KahaDBPersistenceAdapter) broker.getPersistenceAdapter()).getStore().getJournal().getFileMap().values();
        DataFile dataFile = (DataFile) files.toArray()[id];
        RecoverableRandomAccessFile randomAccessFile = dataFile.openRandomAccessFile();

        ArrayList<Integer> batchPositions = findBatch(randomAccessFile, Integer.MAX_VALUE);
        int pos = batchPositions.get(batchPositions.size() - 3);
        LOG.info("corrupting checksum and size (to push it past eof) of batch record at:" + id + "-" + pos);
        randomAccessFile.seek(pos + Journal.BATCH_CONTROL_RECORD_HEADER.length);
        randomAccessFile.writeInt(31 * 1024 * 1024);
        randomAccessFile.writeLong(0L);
        randomAccessFile.getChannel().force(true);
    }

    private ArrayList<Integer> findBatch(RecoverableRandomAccessFile randomAccessFile, int where) throws IOException {
        final ArrayList<Integer> batchPositions = new ArrayList<Integer>();
        final ByteSequence header = new ByteSequence(Journal.BATCH_CONTROL_RECORD_HEADER);
        byte data[] = new byte[1024 * 20];
        ByteSequence bs = new ByteSequence(data, 0, randomAccessFile.read(data, 0, data.length));

        int pos = 0;
        for (int i = 0; i < where; i++) {
            int found = bs.indexOf(header, pos);
            if (found == -1) {
                break;
            }
            batchPositions.add(found);
            pos = found + Journal.BATCH_CONTROL_RECORD_HEADER.length - 1;
        }

        return batchPositions;
    }

    private int getNumberOfJournalFiles() throws IOException {
        Collection<DataFile> files =
                ((KahaDBPersistenceAdapter) broker.getPersistenceAdapter()).getStore().getJournal().getFileMap().values();
        int reality = 0;
        for (DataFile file : files) {
            if (file != null) {
                reality++;
            }
        }
        return reality;
    }

    private int produceMessages(Destination destination, int numToSend) throws Exception {
        int sent = 0;
        Connection connection = new ActiveMQConnectionFactory(
                broker.getTransportConnectors().get(0).getConnectUri()).createConnection();
        connection.start();
        try {
            Session session = connection.createSession(false, Session.AUTO_ACKNOWLEDGE);
            MessageProducer producer = session.createProducer(destination);
            for (int i = 0; i < numToSend; i++) {
                producer.send(createMessage(session, i));
                sent++;
            }
        } finally {
            connection.close();
        }

        return sent;
    }

    private int produceMessagesToConsumeMultipleDataFiles(int numToSend) throws Exception {
        return produceMessages(destination, numToSend);
    }

    final String payload = new String(new byte[1024]);

    private Message createMessage(Session session, int i) throws Exception {
        return session.createTextMessage(payload + "::" + i);
    }

    private int drainQueue(int max) throws Exception {
        Connection connection = cf.createConnection();
        connection.start();
        Session session = connection.createSession(false, Session.AUTO_ACKNOWLEDGE);
        MessageConsumer consumer = session.createConsumer(destination);
        int count = 0;
        while (count < max && consumer.receive(5000) != null) {
            count++;
        }
        consumer.close();
        connection.close();
        return count;
    }
}