HBASE-18216 [AMv2] Workaround for HBASE-18152, corrupt procedure WAL

This commit is contained in:
Michael Stack 2017-06-02 15:38:53 -07:00
parent f50fe22196
commit 1eb2f2593d
1 changed files with 69 additions and 24 deletions

View File

@ -44,7 +44,7 @@ public class ProcedureWALFormatReader {
private static final Log LOG = LogFactory.getLog(ProcedureWALFormatReader.class); private static final Log LOG = LogFactory.getLog(ProcedureWALFormatReader.class);
// ============================================================================================== // ==============================================================================================
// We read the WALs in reverse order. from the newest to the oldest. // We read the WALs in reverse order from the newest to the oldest.
// We have different entry types: // We have different entry types:
// - INIT: Procedure submitted by the user (also known as 'root procedure') // - INIT: Procedure submitted by the user (also known as 'root procedure')
// - INSERT: Children added to the procedure <parentId>:[<childId>, ...] // - INSERT: Children added to the procedure <parentId>:[<childId>, ...]
@ -53,7 +53,8 @@ public class ProcedureWALFormatReader {
// //
// In the WAL we can find multiple times the same procedure as UPDATE or INSERT. // In the WAL we can find multiple times the same procedure as UPDATE or INSERT.
// We read the WAL from top to bottom, so every time we find an entry of the // We read the WAL from top to bottom, so every time we find an entry of the
// same procedure, that will be the "latest" update. // same procedure, that will be the "latest" update (Caveat: with multiple threads writing
// the store, this assumption does not hold).
// //
// We keep two in-memory maps: // We keep two in-memory maps:
// - localProcedureMap: is the map containing the entries in the WAL we are processing // - localProcedureMap: is the map containing the entries in the WAL we are processing
@ -65,7 +66,7 @@ public class ProcedureWALFormatReader {
// //
// The WAL is append-only so the last procedure in the WAL is the one that // The WAL is append-only so the last procedure in the WAL is the one that
// was in execution at the time we crashed/closed the server. // was in execution at the time we crashed/closed the server.
// given that, the procedure replay order can be inferred by the WAL order. // Given that, the procedure replay order can be inferred by the WAL order.
// //
// Example: // Example:
// WAL-2: [A, B, A, C, D] // WAL-2: [A, B, A, C, D]
@ -78,7 +79,7 @@ public class ProcedureWALFormatReader {
// WAL-2 localProcedureMap.replayOrder is [D, C, A, B] // WAL-2 localProcedureMap.replayOrder is [D, C, A, B]
// WAL-1 localProcedureMap.replayOrder is [F, G] // WAL-1 localProcedureMap.replayOrder is [F, G]
// //
// each time we reach the WAL-EOF, the "replayOrder" list is merged/appended in 'procedureMap' // Each time we reach the WAL-EOF, the "replayOrder" list is merged/appended in 'procedureMap'
// so using the example above we end up with: [D, C, A, B] + [F, G] as replay order. // so using the example above we end up with: [D, C, A, B] + [F, G] as replay order.
// //
// Fast Start: INIT/INSERT record and StackIDs // Fast Start: INIT/INSERT record and StackIDs
@ -319,12 +320,25 @@ public class ProcedureWALFormatReader {
protected ProcedureProtos.Procedure proto; protected ProcedureProtos.Procedure proto;
protected boolean ready = false; protected boolean ready = false;
public Entry(Entry hashNext) { this.hashNext = hashNext; } public Entry(Entry hashNext) {
this.hashNext = hashNext;
}
public long getProcId() { return proto.getProcId(); } public long getProcId() {
public long getParentId() { return proto.getParentId(); } return proto.getProcId();
public boolean hasParent() { return proto.hasParentId(); } }
public boolean isReady() { return ready; }
public long getParentId() {
return proto.getParentId();
}
public boolean hasParent() {
return proto.hasParentId();
}
public boolean isReady() {
return ready;
}
public boolean isFinished() { public boolean isFinished() {
if (!hasParent()) { if (!hasParent()) {
@ -443,11 +457,17 @@ public class ProcedureWALFormatReader {
public void add(ProcedureProtos.Procedure procProto) { public void add(ProcedureProtos.Procedure procProto) {
trackProcIds(procProto.getProcId()); trackProcIds(procProto.getProcId());
Entry entry = addToMap(procProto.getProcId(), procProto.hasParentId()); Entry entry = addToMap(procProto.getProcId(), procProto.hasParentId());
boolean isNew = entry.proto == null; boolean newEntry = entry.proto == null;
// We have seen procedure WALs where the entries are out of order; see HBASE-18152.
// To compensate, only replace the Entry procedure if for sure this new procedure
// is indeed an entry that came later. TODO: Fix the writing of procedure info so
// it does not violate basic expectation, that WALs contain procedure changes going
// from start to finish in sequence.
if (newEntry || isIncreasing(entry.proto, procProto)) {
entry.proto = procProto; entry.proto = procProto;
}
addToReplayList(entry); addToReplayList(entry);
if(newEntry) {
if (isNew) {
if (procProto.hasParentId()) { if (procProto.hasParentId()) {
childUnlinkedHead = addToLinkList(entry, childUnlinkedHead); childUnlinkedHead = addToLinkList(entry, childUnlinkedHead);
} else { } else {
@ -456,6 +476,21 @@ public class ProcedureWALFormatReader {
} }
} }
/**
* @return True if this new procedure is 'richer' than the current one else
* false and we log this incidence where it appears that the WAL has older entries
* appended after newer ones. See HBASE-18152.
*/
private static boolean isIncreasing(ProcedureProtos.Procedure current,
ProcedureProtos.Procedure candidate) {
boolean increasing = current.getStackIdCount() < candidate.getStackIdCount() &&
current.getLastUpdate() <= candidate.getLastUpdate();
if (!increasing) {
LOG.warn("NOT INCREASING! current=" + current + ", candidate=" + candidate);
}
return increasing;
}
public boolean remove(long procId) { public boolean remove(long procId) {
trackProcIds(procId); trackProcIds(procId);
Entry entry = removeFromMap(procId); Entry entry = removeFromMap(procId);
@ -634,7 +669,7 @@ public class ProcedureWALFormatReader {
} }
/* /*
* (see the comprehensive explaination in the beginning of the file) * (see the comprehensive explanation in the beginning of the file)
* A Procedure is ready when parent and children are ready. * A Procedure is ready when parent and children are ready.
* "ready" means that we all the information that we need in-memory. * "ready" means that we all the information that we need in-memory.
* *
@ -651,9 +686,9 @@ public class ProcedureWALFormatReader {
* - easy case, the parent is missing from the global map * - easy case, the parent is missing from the global map
* - more complex case we look at the Stack IDs. * - more complex case we look at the Stack IDs.
* *
* The Stack-IDs are added to the procedure order as incremental index * The Stack-IDs are added to the procedure order as an incremental index
* tracking how many times that procedure was executed, which is equivalent * tracking how many times that procedure was executed, which is equivalent
* at the number of times we wrote the procedure to the WAL. * to the number of times we wrote the procedure to the WAL.
* In the example above: * In the example above:
* wal-2: B has stackId = [1, 2] * wal-2: B has stackId = [1, 2]
* wal-1: B has stackId = [1] * wal-1: B has stackId = [1]
@ -663,7 +698,7 @@ public class ProcedureWALFormatReader {
* we notice that there is a gap in the stackIds of B, so something was * we notice that there is a gap in the stackIds of B, so something was
* executed before. * executed before.
* To identify when a Procedure is ready we do the sum of the stackIds of * To identify when a Procedure is ready we do the sum of the stackIds of
* the procedure and the parent. if the stackIdSum is equals to the * the procedure and the parent. if the stackIdSum is equal to the
* sum of {1..maxStackId} then everything we need is available. * sum of {1..maxStackId} then everything we need is available.
* *
* Example-2 * Example-2
@ -695,6 +730,10 @@ public class ProcedureWALFormatReader {
int stackId = 1 + rootEntry.proto.getStackId(i); int stackId = 1 + rootEntry.proto.getStackId(i);
maxStackId = Math.max(maxStackId, stackId); maxStackId = Math.max(maxStackId, stackId);
stackIdSum += stackId; stackIdSum += stackId;
if (LOG.isTraceEnabled()) {
LOG.trace("stackId=" + stackId + " stackIdSum=" + stackIdSum +
" maxStackid=" + maxStackId + " " + rootEntry);
}
} }
for (Entry p = rootEntry.childHead; p != null; p = p.linkNext) { for (Entry p = rootEntry.childHead; p != null; p = p.linkNext) {
@ -702,8 +741,14 @@ public class ProcedureWALFormatReader {
int stackId = 1 + p.proto.getStackId(i); int stackId = 1 + p.proto.getStackId(i);
maxStackId = Math.max(maxStackId, stackId); maxStackId = Math.max(maxStackId, stackId);
stackIdSum += stackId; stackIdSum += stackId;
if (LOG.isTraceEnabled()) {
LOG.trace("stackId=" + stackId + " stackIdSum=" + stackIdSum +
" maxStackid=" + maxStackId + " " + p);
} }
} }
}
// The cmpStackIdSum is this formula for finding the sum of a series of numbers:
// http://www.wikihow.com/Sum-the-Integers-from-1-to-N#/Image:Sum-the-Integers-from-1-to-N-Step-2-Version-3.jpg
final int cmpStackIdSum = (maxStackId * (maxStackId + 1) / 2); final int cmpStackIdSum = (maxStackId * (maxStackId + 1) / 2);
if (cmpStackIdSum == stackIdSum) { if (cmpStackIdSum == stackIdSum) {
rootEntry.ready = true; rootEntry.ready = true;