mirror of https://github.com/apache/activemq.git
https://issues.apache.org/jira/browse/AMQ-5703 - further tests and fixes. Ensure an early EOF can be identified when checking for corruption, and skip corrupt records on replay when the checksum is invalid.
parent 7dc522d4c3
commit 73db4d2bfd
@@ -603,10 +603,19 @@ public abstract class MessageDatabase extends ServiceSupport implements BrokerSe
         int redoCounter = 0;
         LOG.info("Recovering from the journal @" + recoveryPosition);
         while (recoveryPosition != null) {
+            try {
                 JournalCommand<?> message = load(recoveryPosition);
                 metadata.lastUpdate = recoveryPosition;
                 process(message, recoveryPosition, lastIndoubtPosition);
                 redoCounter++;
+            } catch (IOException failedRecovery) {
+                if (isIgnoreMissingJournalfiles()) {
+                    // track this dud location
+                    journal.corruptRecoveryLocation(recoveryPosition);
+                } else {
+                    throw failedRecovery;
+                }
+            }
             recoveryPosition = journal.getNextLocation(recoveryPosition);
             if (LOG.isInfoEnabled() && redoCounter % 100000 == 0) {
                 LOG.info("@" + recoveryPosition + ", " + redoCounter + " entries recovered ..");
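The new try/catch only tolerates a failed record when ignoreMissingJournalfiles is enabled on the store; otherwise the IOException still aborts recovery. A minimal configuration sketch, using the two KahaDB adapter setters exercised by the test added in this commit (broker wiring omitted):

    KahaDBPersistenceAdapter adapter = (KahaDBPersistenceAdapter) broker.getPersistenceAdapter();
    adapter.setCheckForCorruptJournalFiles(true); // validate batch checksums while scanning the journal
    adapter.setIgnoreMissingJournalfiles(true);   // skip, rather than fail on, corrupt records during replay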
@@ -826,8 +835,8 @@ public abstract class MessageDatabase extends ServiceSupport implements BrokerSe
             }

             if (!missingPredicates.isEmpty()) {
-                for (StoredDestination sd : storedDestinations.values()) {
+                for (Entry<String, StoredDestination> sdEntry : storedDestinations.entrySet()) {
+                    final StoredDestination sd = sdEntry.getValue();
                     final ArrayList<Long> matches = new ArrayList<Long>();
                     sd.locationIndex.visit(tx, new BTreeVisitor.OrVisitor<Location, Long>(missingPredicates) {
                         @Override
@@ -847,6 +856,7 @@ public abstract class MessageDatabase extends ServiceSupport implements BrokerSe
                                 MessageKeys keys = sd.orderIndex.remove(tx, sequenceId);
                                 sd.locationIndex.remove(tx, keys.location);
                                 sd.messageIdIndex.remove(tx, keys.messageId);
+                                LOG.info("[" + sdEntry.getKey() + "] dropped: " + keys.messageId + " at corrupt location: " + keys.location);
                                 undoCounter++;
                                 // TODO: do we need to modify the ack positions for the pub sub case?
                             }
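The loop switch from values() to entrySet() in the previous hunk exists only to make the destination name available to this new log line; the pattern in isolation (types as in the surrounding code):

    for (Entry<String, StoredDestination> sdEntry : storedDestinations.entrySet()) {
        final StoredDestination sd = sdEntry.getValue(); // same object values() would have yielded
        // ... sdEntry.getKey() now identifies the destination when a message is dropped
    }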
@@ -55,6 +55,28 @@ public class Journal {
     public static final int BATCH_CONTROL_RECORD_SIZE = RECORD_HEAD_SPACE+BATCH_CONTROL_RECORD_MAGIC.length+4+8;
     public static final byte[] BATCH_CONTROL_RECORD_HEADER = createBatchControlRecordHeader();

+    // tackle corruption when checksum is disabled or corrupt with zeros, minimise data loss
+    public void corruptRecoveryLocation(Location recoveryPosition) throws IOException {
+        DataFile dataFile = getDataFile(recoveryPosition);
+        // with corruption on recovery we have no faith in the content - slip to the next batch record or eof
+        DataFileAccessor reader = accessorPool.openDataFileAccessor(dataFile);
+        try {
+            int nextOffset = findNextBatchRecord(reader, recoveryPosition.getOffset() + 1);
+            Sequence sequence = new Sequence(recoveryPosition.getOffset(), nextOffset >= 0 ? nextOffset - 1 : dataFile.getLength() - 1);
+            LOG.info("Corrupt journal records found in '" + dataFile.getFile() + "' between offsets: " + sequence);
+
+            // skip corruption on getNextLocation
+            recoveryPosition.setOffset((int) sequence.getLast() + 1);
+            recoveryPosition.setSize(-1);
+
+            dataFile.corruptedBlocks.add(sequence);
+        } catch (IOException e) {
+        } finally {
+            accessorPool.closeDataFileAccessor(reader);
+        }
+    }
+
     public enum PreallocationStrategy {
         SPARSE_FILE,
         OS_KERNEL_COPY,
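corruptRecoveryLocation records the suspect region as an inclusive Sequence and advances the replay cursor just past it. With hypothetical offsets (a bad record at 1000, the next batch header found at 1500):

    int nextOffset = 1500;                                    // findNextBatchRecord result; -1 would mean none before EOF
    Sequence sequence = new Sequence(1000, nextOffset - 1);   // corrupt bytes span [1000, 1499] inclusive
    recoveryPosition.setOffset((int) sequence.getLast() + 1); // replay resumes at offset 1500
    recoveryPosition.setSize(-1);                             // assumption: -1 marks the size as unknown so it is re-read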
@@ -301,7 +323,7 @@ public class Journal {
         try {
             while( true ) {
                 int size = checkBatchRecord(reader, location.getOffset());
-                if ( size>=0 ) {
+                if ( size>=0 && location.getOffset()+BATCH_CONTROL_RECORD_SIZE+size <= dataFile.getLength()) {
                     location.setOffset(location.getOffset()+BATCH_CONTROL_RECORD_SIZE+size);
                 } else {
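The added bound rejects a batch whose declared size would run past the end of the file. This is exactly the shape of corruption the new corruptBatchEndEof test manufactures below: a batch header near the tail of a ~20KB data file claiming a 31MB payload now fails location.getOffset() + BATCH_CONTROL_RECORD_SIZE + size <= dataFile.getLength() and is treated as corrupt, instead of the reader seeking past EOF.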
@@ -0,0 +1,312 @@ (new file: org.apache.activemq.store.kahadb.JournalCorruptionEofIndexRecoveryTest)

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.activemq.store.kahadb;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import javax.jms.Connection;
import javax.jms.Destination;
import javax.jms.Message;
import javax.jms.MessageConsumer;
import javax.jms.MessageProducer;
import javax.jms.Session;
import org.apache.activemq.ActiveMQConnectionFactory;
import org.apache.activemq.broker.BrokerService;
import org.apache.activemq.command.ActiveMQQueue;
import org.apache.activemq.store.kahadb.disk.journal.DataFile;
import org.apache.activemq.store.kahadb.disk.journal.Journal;
import org.apache.activemq.util.ByteSequence;
import org.apache.activemq.util.IOHelper;
import org.apache.activemq.util.RecoverableRandomAccessFile;
import org.junit.After;
import org.junit.Test;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class JournalCorruptionEofIndexRecoveryTest {

    private static final Logger LOG = LoggerFactory.getLogger(JournalCorruptionEofIndexRecoveryTest.class);

    ActiveMQConnectionFactory cf = null;
    BrokerService broker = null;
    private final Destination destination = new ActiveMQQueue("Test");
    private String connectionUri;
    private KahaDBPersistenceAdapter adapter;

    protected void startBroker() throws Exception {
        doStartBroker(true, false);
    }

    protected void restartBroker(boolean whackIndex) throws Exception {
        restartBroker(whackIndex, false);
    }

    protected void restartBroker(boolean whackIndex, boolean forceRecoverIndex) throws Exception {
        File dataDir = broker.getPersistenceAdapter().getDirectory();

        if (broker != null) {
            broker.stop();
            broker.waitUntilStopped();
        }

        if (whackIndex) {
            File indexToDelete = new File(dataDir, "db.data");
            LOG.info("Whacking index: " + indexToDelete);
            indexToDelete.delete();
        }

        doStartBroker(false, forceRecoverIndex);
    }

    private void doStartBroker(boolean delete, boolean forceRecoverIndex) throws Exception {
        broker = new BrokerService();
        if (delete) {
            IOHelper.deleteChildren(broker.getPersistenceAdapter().getDirectory());
            IOHelper.delete(broker.getPersistenceAdapter().getDirectory());
        }

        broker.setPersistent(true);
        broker.setUseJmx(true);
        broker.addConnector("tcp://localhost:0");

        configurePersistence(broker, forceRecoverIndex);

        connectionUri = "vm://localhost?create=false";
        cf = new ActiveMQConnectionFactory(connectionUri);

        broker.start();
        LOG.info("Starting broker..");
    }

    protected void configurePersistence(BrokerService brokerService, boolean forceRecoverIndex) throws Exception {
        adapter = (KahaDBPersistenceAdapter) brokerService.getPersistenceAdapter();

        adapter.setForceRecoverIndex(forceRecoverIndex);

        // ensure there are a bunch of data files but multiple entries in each
        adapter.setJournalMaxFileLength(1024 * 20);

        // speed up the test case, checkpoint and cleanup early and often
        adapter.setCheckpointInterval(5000);
        adapter.setCleanupInterval(5000);

        adapter.setCheckForCorruptJournalFiles(true);
        adapter.setIgnoreMissingJournalfiles(true);
    }

    @After
    public void tearDown() throws Exception {
        if (broker != null) {
            broker.stop();
            broker.waitUntilStopped();
        }
    }

    @Test
    public void testRecoveryAfterCorruptionEof() throws Exception {
        startBroker();

        produceMessagesToConsumeMultipleDataFiles(50);

        int numFiles = getNumberOfJournalFiles();

        assertTrue("more than x files: " + numFiles, numFiles > 2);

        corruptBatchEndEof(3);

        restartBroker(false);

        assertEquals("missing one message", 49, broker.getAdminView().getTotalMessageCount());
        assertEquals("Drain", 49, drainQueue(49));
    }

    @Test
    public void testRecoveryAfterCorruptionCheckSum() throws Exception {
        startBroker();

        produceMessagesToConsumeMultipleDataFiles(4);

        corruptBatchCheckSumSplash(1);

        restartBroker(true);

        assertEquals("missing one message", 3, broker.getAdminView().getTotalMessageCount());
        assertEquals("Drain", 3, drainQueue(4));
    }

    @Test
    public void testRecoveryAfterCorruptionCheckSumExistingIndex() throws Exception {
        startBroker();

        produceMessagesToConsumeMultipleDataFiles(4);

        corruptBatchCheckSumSplash(1);

        restartBroker(false);

        assertEquals("unnoticed", 4, broker.getAdminView().getTotalMessageCount());
        assertEquals("Drain", 0, drainQueue(4));

        // force recover index and lose one message
        restartBroker(false, true);

        assertEquals("missing one index recreation", 3, broker.getAdminView().getTotalMessageCount());
        assertEquals("Drain", 3, drainQueue(4));
    }

    private void corruptBatchCheckSumSplash(int id) throws Exception {
        Collection<DataFile> files =
                ((KahaDBPersistenceAdapter) broker.getPersistenceAdapter()).getStore().getJournal().getFileMap().values();
        DataFile dataFile = (DataFile) files.toArray()[0];
        RecoverableRandomAccessFile randomAccessFile = dataFile.openRandomAccessFile();

        ArrayList<Integer> batchPositions = findBatch(randomAccessFile, Integer.MAX_VALUE);
        LOG.info("Batch positions: " + batchPositions);
        int pos = batchPositions.get(1);
        LOG.info("corrupting checksum and size (to push it past eof) of batch record at:" + id + "-" + pos);
        randomAccessFile.seek(pos + Journal.BATCH_CONTROL_RECORD_HEADER.length + 4);
        // whack the batch control record checksum
        randomAccessFile.writeLong(0l);

        // mod the data size in the location header so reading blows
        randomAccessFile.seek(pos + Journal.BATCH_CONTROL_RECORD_SIZE);
        int size = randomAccessFile.readInt();
        byte type = randomAccessFile.readByte();

        LOG.info("Read: size:" + size + ", type:" + type);

        randomAccessFile.seek(pos + Journal.BATCH_CONTROL_RECORD_SIZE);
        size -= 1;
        LOG.info("rewrite incorrect location size @:" + (pos + Journal.BATCH_CONTROL_RECORD_SIZE) + " as: " + size);
        randomAccessFile.writeInt(size);

        randomAccessFile.getChannel().force(true);
    }

    private void corruptBatchEndEof(int id) throws Exception {
        Collection<DataFile> files =
                ((KahaDBPersistenceAdapter) broker.getPersistenceAdapter()).getStore().getJournal().getFileMap().values();
        DataFile dataFile = (DataFile) files.toArray()[id];
        RecoverableRandomAccessFile randomAccessFile = dataFile.openRandomAccessFile();

        ArrayList<Integer> batchPositions = findBatch(randomAccessFile, Integer.MAX_VALUE);
        int pos = batchPositions.get(batchPositions.size() - 3);
        LOG.info("corrupting checksum and size (to push it past eof) of batch record at:" + id + "-" + pos);
        randomAccessFile.seek(pos + Journal.BATCH_CONTROL_RECORD_HEADER.length);
        randomAccessFile.writeInt(31 * 1024 * 1024);
        randomAccessFile.writeLong(0l);
        randomAccessFile.getChannel().force(true);
    }

    private ArrayList<Integer> findBatch(RecoverableRandomAccessFile randomAccessFile, int where) throws IOException {
        final ArrayList<Integer> batchPositions = new ArrayList<Integer>();
        final ByteSequence header = new ByteSequence(Journal.BATCH_CONTROL_RECORD_HEADER);
        byte data[] = new byte[1024 * 20];

        ByteSequence bs = new ByteSequence(data, 0, randomAccessFile.read(data, 0, data.length));

        int pos = 0;
        for (int i = 0; i < where; i++) {
            int found = bs.indexOf(header, pos);
            if (found == -1) {
                break;
            }
            batchPositions.add(found);
            pos = found + Journal.BATCH_CONTROL_RECORD_HEADER.length - 1;
        }

        return batchPositions;
    }

    private int getNumberOfJournalFiles() throws IOException {
        Collection<DataFile> files =
                ((KahaDBPersistenceAdapter) broker.getPersistenceAdapter()).getStore().getJournal().getFileMap().values();
        int reality = 0;
        for (DataFile file : files) {
            if (file != null) {
                reality++;
            }
        }
        return reality;
    }

    private int produceMessages(Destination destination, int numToSend) throws Exception {
        int sent = 0;
        Connection connection = new ActiveMQConnectionFactory(
                broker.getTransportConnectors().get(0).getConnectUri()).createConnection();
        connection.start();
        try {
            Session session = connection.createSession(false, Session.AUTO_ACKNOWLEDGE);
            MessageProducer producer = session.createProducer(destination);
            for (int i = 0; i < numToSend; i++) {
                producer.send(createMessage(session, i));
                sent++;
            }
        } finally {
            connection.close();
        }

        return sent;
    }

    private int produceMessagesToConsumeMultipleDataFiles(int numToSend) throws Exception {
        return produceMessages(destination, numToSend);
    }

    final String payload = new String(new byte[1024]);

    private Message createMessage(Session session, int i) throws Exception {
        return session.createTextMessage(payload + "::" + i);
    }

    private int drainQueue(int max) throws Exception {
        Connection connection = cf.createConnection();
        connection.start();
        Session session = connection.createSession(false, Session.AUTO_ACKNOWLEDGE);
        MessageConsumer consumer = session.createConsumer(destination);
        int count = 0;
        while (count < max && consumer.receive(5000) != null) {
            count++;
        }
        consumer.close();
        connection.close();
        return count;
    }
}
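To run just this test class, the usual surefire selector should work (from the module containing the KahaDB store tests; exact module path assumed):

    mvn test -Dtest=JournalCorruptionEofIndexRecoveryTest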