hbase-8763: Combine MVCC and SeqId

This commit is contained in:
Jeffrey Zhong 2014-06-06 18:25:46 -07:00
parent d6cc2fb1ea
commit c682d57e92
20 changed files with 555 additions and 339 deletions

View File

@ -45,6 +45,7 @@ import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ClassSize; import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.CollectionBackedScanner; import org.apache.hadoop.hbase.util.CollectionBackedScanner;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.ReflectionUtils; import org.apache.hadoop.hbase.util.ReflectionUtils;
/** /**
@ -210,12 +211,13 @@ public class DefaultMemStore implements MemStore {
/** /**
* Write an update * Write an update
* @param cell * @param cell
* @return approximate size of the passed key and value. * @return approximate size of the passed KV & newly added KV which maybe different than the
* passed-in KV
*/ */
@Override @Override
public long add(Cell cell) { public Pair<Long, Cell> add(Cell cell) {
KeyValue toAdd = maybeCloneWithAllocator(KeyValueUtil.ensureKeyValue(cell)); KeyValue toAdd = maybeCloneWithAllocator(KeyValueUtil.ensureKeyValue(cell));
return internalAdd(toAdd); return new Pair<Long, Cell>(internalAdd(toAdd), toAdd);
} }
@Override @Override
@ -1051,18 +1053,21 @@ public class DefaultMemStore implements MemStore {
byte [] empty = new byte[0]; byte [] empty = new byte[0];
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
// Give each its own ts // Give each its own ts
size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty)); Pair<Long, Cell> ret = memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
size += ret.getFirst();
} }
LOG.info("memstore1 estimated size=" + size); LOG.info("memstore1 estimated size=" + size);
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
size += memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty)); Pair<Long, Cell> ret = memstore1.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, empty));
size += ret.getFirst();
} }
LOG.info("memstore1 estimated size (2nd loading of same data)=" + size); LOG.info("memstore1 estimated size (2nd loading of same data)=" + size);
// Make a variably sized memstore. // Make a variably sized memstore.
DefaultMemStore memstore2 = new DefaultMemStore(); DefaultMemStore memstore2 = new DefaultMemStore();
for (int i = 0; i < count; i++) { for (int i = 0; i < count; i++) {
size += memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i, Pair<Long, Cell> ret = memstore2.add(new KeyValue(Bytes.toBytes(i), fam, qf, i,
new byte[i])); new byte[i]));
size += ret.getFirst();
} }
LOG.info("memstore2 estimated size=" + size); LOG.info("memstore2 estimated size=" + size);
final int seconds = 30; final int seconds = 30;

View File

@ -824,10 +824,11 @@ public class HRegion implements HeapSize { // , Writable{
} }
} }
} }
mvcc.initialize(maxMemstoreTS + 1);
// Recover any edits if available. // Recover any edits if available.
maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny( maxSeqId = Math.max(maxSeqId, replayRecoveredEditsIfAny(
this.fs.getRegionDir(), maxSeqIdInStores, reporter, status)); this.fs.getRegionDir(), maxSeqIdInStores, reporter, status));
maxSeqId = Math.max(maxSeqId, maxMemstoreTS + 1);
mvcc.initialize(maxSeqId);
return maxSeqId; return maxSeqId;
} }
@ -1684,7 +1685,7 @@ public class HRegion implements HeapSize { // , Writable{
// wal can be null replaying edits. // wal can be null replaying edits.
return wal != null? return wal != null?
new FlushResult(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, new FlushResult(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY,
getNextSequenceId(wal, startTime), "Nothing to flush"): getNextSequenceId(wal), "Nothing to flush"):
new FlushResult(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush"); new FlushResult(FlushResult.Result.CANNOT_FLUSH_MEMSTORE_EMPTY, "Nothing to flush");
} }
} finally { } finally {
@ -1714,58 +1715,64 @@ public class HRegion implements HeapSize { // , Writable{
getRegionInfo().getEncodedName()); getRegionInfo().getEncodedName());
List<StoreFlushContext> storeFlushCtxs = new ArrayList<StoreFlushContext>(stores.size()); List<StoreFlushContext> storeFlushCtxs = new ArrayList<StoreFlushContext>(stores.size());
long flushSeqId = -1L; long flushSeqId = -1L;
try { try {
// Record the mvcc for all transactions in progress. try {
w = mvcc.beginMemstoreInsert(); w = mvcc.beginMemstoreInsert();
mvcc.advanceMemstore(w); if (wal != null) {
if (wal != null) { if (!wal.startCacheFlush(this.getRegionInfo().getEncodedNameAsBytes())) {
if (!wal.startCacheFlush(this.getRegionInfo().getEncodedNameAsBytes())) { // This should never happen.
// This should never happen. String msg = "Flush will not be started for ["
String msg = "Flush will not be started for [" + this.getRegionInfo().getEncodedName() + "] - because the WAL is closing.";
+ this.getRegionInfo().getEncodedName() + "] - because the WAL is closing."; status.setStatus(msg);
status.setStatus(msg); return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg);
return new FlushResult(FlushResult.Result.CANNOT_FLUSH, msg); }
// Get a sequence id that we can use to denote the flush. It will be one beyond the last
// edit that made it into the hfile (the below does not add an edit, it just asks the
// WAL system to return next sequence edit).
flushSeqId = getNextSequenceId(wal);
} else {
// use the provided sequence Id as WAL is not being used for this flush.
flushSeqId = myseqid;
} }
// Get a sequence id that we can use to denote the flush. It will be one beyond the last
// edit that made it into the hfile (the below does not add an edit, it just asks the
// WAL system to return next sequence edit).
flushSeqId = getNextSequenceId(wal, startTime);
} else {
// use the provided sequence Id as WAL is not being used for this flush.
flushSeqId = myseqid;
}
for (Store s : stores.values()) { for (Store s : stores.values()) {
totalFlushableSize += s.getFlushableSize(); totalFlushableSize += s.getFlushableSize();
storeFlushCtxs.add(s.createFlushContext(flushSeqId)); storeFlushCtxs.add(s.createFlushContext(flushSeqId));
} }
// Prepare flush (take a snapshot) // Prepare flush (take a snapshot)
for (StoreFlushContext flush : storeFlushCtxs) { for (StoreFlushContext flush : storeFlushCtxs) {
flush.prepare(); flush.prepare();
}
} finally {
this.updatesLock.writeLock().unlock();
} }
String s = "Finished memstore snapshotting " + this +
", syncing WAL and waiting on mvcc, flushsize=" + totalFlushableSize;
status.setStatus(s);
if (LOG.isTraceEnabled()) LOG.trace(s);
// sync unflushed WAL changes when deferred log sync is enabled
// see HBASE-8208 for details
if (wal != null && !shouldSyncLog()) wal.sync();
// wait for all in-progress transactions to commit to HLog before
// we can start the flush. This prevents
// uncommitted transactions from being written into HFiles.
// We have to block before we start the flush, otherwise keys that
// were removed via a rollbackMemstore could be written to Hfiles.
mvcc.waitForPreviousTransactionsComplete(w);
// set w to null to prevent mvcc.advanceMemstore from being called again inside finally block
w = null;
s = "Flushing stores of " + this;
status.setStatus(s);
if (LOG.isTraceEnabled()) LOG.trace(s);
} finally { } finally {
this.updatesLock.writeLock().unlock(); if (w != null) {
// in case of failure just mark current w as complete
mvcc.advanceMemstore(w);
}
} }
String s = "Finished memstore snapshotting " + this +
", syncing WAL and waiting on mvcc, flushSize=" + totalFlushableSize;
status.setStatus(s);
if (LOG.isTraceEnabled()) LOG.trace(s);
// sync unflushed WAL changes when deferred log sync is enabled
// see HBASE-8208 for details
if (wal != null && !shouldSyncLog()) wal.sync();
// wait for all in-progress transactions to commit to HLog before
// we can start the flush. This prevents
// uncommitted transactions from being written into HFiles.
// We have to block before we start the flush, otherwise keys that
// were removed via a rollbackMemstore could be written to Hfiles.
mvcc.waitForRead(w);
s = "Flushing stores of " + this;
status.setStatus(s);
if (LOG.isTraceEnabled()) LOG.trace(s);
// Any failure from here on out will be catastrophic requiring server // Any failure from here on out will be catastrophic requiring server
// restart so hlog content can be replayed and put back into the memstore. // restart so hlog content can be replayed and put back into the memstore.
@ -1849,13 +1856,9 @@ public class HRegion implements HeapSize { // , Writable{
* @return Next sequence number unassociated with any actual edit. * @return Next sequence number unassociated with any actual edit.
* @throws IOException * @throws IOException
*/ */
private long getNextSequenceId(final HLog wal, final long now) throws IOException { private long getNextSequenceId(final HLog wal) throws IOException {
HLogKey key = new HLogKey(getRegionInfo().getEncodedNameAsBytes(), getRegionInfo().getTable()); HLogKey key = this.appendNoSyncNoAppend(wal, null);
// Call append but with an empty WALEdit. The returned sequence id will not be associated return key.getSequenceNumber();
// with any edit and we can be sure it went in after all outstanding appends.
wal.appendNoSync(getTableDesc(), getRegionInfo(), key,
WALEdit.EMPTY_WALEDIT, this.sequenceId, false);
return key.getLogSeqNum();
} }
////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////
@ -2349,11 +2352,14 @@ public class HRegion implements HeapSize { // , Writable{
List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length); List<RowLock> acquiredRowLocks = Lists.newArrayListWithCapacity(batchOp.operations.length);
// reference family maps directly so coprocessors can mutate them if desired // reference family maps directly so coprocessors can mutate them if desired
Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length]; Map<byte[], List<Cell>>[] familyMaps = new Map[batchOp.operations.length];
List<KeyValue> memstoreCells = new ArrayList<KeyValue>();
// We try to set up a batch in the range [firstIndex,lastIndexExclusive) // We try to set up a batch in the range [firstIndex,lastIndexExclusive)
int firstIndex = batchOp.nextIndexToProcess; int firstIndex = batchOp.nextIndexToProcess;
int lastIndexExclusive = firstIndex; int lastIndexExclusive = firstIndex;
boolean success = false; boolean success = false;
int noOfPuts = 0, noOfDeletes = 0; int noOfPuts = 0, noOfDeletes = 0;
HLogKey walKey = null;
long mvccNum = 0;
try { try {
// ------------------------------------ // ------------------------------------
// STEP 1. Try to acquire as many locks as we can, and ensure // STEP 1. Try to acquire as many locks as we can, and ensure
@ -2475,13 +2481,13 @@ public class HRegion implements HeapSize { // , Writable{
lock(this.updatesLock.readLock(), numReadyToWrite); lock(this.updatesLock.readLock(), numReadyToWrite);
locked = true; locked = true;
mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
// //
// ------------------------------------ // ------------------------------------
// Acquire the latest mvcc number // Acquire the latest mvcc number
// ---------------------------------- // ----------------------------------
w = mvcc.beginMemstoreInsert(); w = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
// calling the pre CP hook for batch mutation // calling the pre CP hook for batch mutation
if (!isInReplay && coprocessorHost != null) { if (!isInReplay && coprocessorHost != null) {
MiniBatchOperationInProgress<Mutation> miniBatchOp = MiniBatchOperationInProgress<Mutation> miniBatchOp =
@ -2506,13 +2512,12 @@ public class HRegion implements HeapSize { // , Writable{
continue; continue;
} }
doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote doRollBackMemstore = true; // If we have a failure, we need to clean what we wrote
addedSize += applyFamilyMapToMemstore(familyMaps[i], w); addedSize += applyFamilyMapToMemstore(familyMaps[i], mvccNum, memstoreCells);
} }
// ------------------------------------ // ------------------------------------
// STEP 4. Build WAL edit // STEP 4. Build WAL edit
// ---------------------------------- // ----------------------------------
boolean hasWalAppends = false;
Durability durability = Durability.USE_DEFAULT; Durability durability = Durability.USE_DEFAULT;
for (int i = firstIndex; i < lastIndexExclusive; i++) { for (int i = firstIndex; i < lastIndexExclusive; i++) {
// Skip puts that were determined to be invalid during preprocessing // Skip puts that were determined to be invalid during preprocessing
@ -2543,13 +2548,13 @@ public class HRegion implements HeapSize { // , Writable{
throw new IOException("Multiple nonces per batch and not in replay"); throw new IOException("Multiple nonces per batch and not in replay");
} }
// txid should always increase, so having the one from the last call is ok. // txid should always increase, so having the one from the last call is ok.
HLogKey key = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(), walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), now, m.getClusterIds(), currentNonceGroup, this.htableDescriptor.getTableName(), now, m.getClusterIds(),
currentNonce); currentNonceGroup, currentNonce);
txid = this.log.appendNoSync(this.htableDescriptor, this.getRegionInfo(), key, txid = this.log.appendNoSync(this.htableDescriptor, this.getRegionInfo(), walKey,
walEdit, getSequenceId(), true); walEdit, getSequenceId(), true, null);
hasWalAppends = true;
walEdit = new WALEdit(isInReplay); walEdit = new WALEdit(isInReplay);
walKey = null;
} }
currentNonceGroup = nonceGroup; currentNonceGroup = nonceGroup;
currentNonce = nonce; currentNonce = nonce;
@ -2570,12 +2575,15 @@ public class HRegion implements HeapSize { // , Writable{
// ------------------------- // -------------------------
Mutation mutation = batchOp.getMutation(firstIndex); Mutation mutation = batchOp.getMutation(firstIndex);
if (walEdit.size() > 0) { if (walEdit.size() > 0) {
HLogKey key = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(), walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), now, mutation.getClusterIds(), this.htableDescriptor.getTableName(), HLog.NO_SEQUENCE_ID, now,
currentNonceGroup, currentNonce); mutation.getClusterIds(), currentNonceGroup, currentNonce);
txid = this.log.appendNoSync(this.htableDescriptor, this.getRegionInfo(), key, walEdit, txid = this.log.appendNoSync(this.htableDescriptor, this.getRegionInfo(), walKey, walEdit,
getSequenceId(), true); getSequenceId(), true, memstoreCells);
hasWalAppends = true; }
if(walKey == null){
// Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned
walKey = this.appendNoSyncNoAppend(this.log, memstoreCells);
} }
// ------------------------------- // -------------------------------
@ -2590,9 +2598,10 @@ public class HRegion implements HeapSize { // , Writable{
// ------------------------- // -------------------------
// STEP 7. Sync wal. // STEP 7. Sync wal.
// ------------------------- // -------------------------
if (hasWalAppends) { if (txid != 0) {
syncOrDefer(txid, durability); syncOrDefer(txid, durability);
} }
doRollBackMemstore = false; doRollBackMemstore = false;
// calling the post CP hook for batch mutation // calling the post CP hook for batch mutation
if (!isInReplay && coprocessorHost != null) { if (!isInReplay && coprocessorHost != null) {
@ -2606,7 +2615,7 @@ public class HRegion implements HeapSize { // , Writable{
// STEP 8. Advance mvcc. This will make this put visible to scanners and getters. // STEP 8. Advance mvcc. This will make this put visible to scanners and getters.
// ------------------------------------------------------------------ // ------------------------------------------------------------------
if (w != null) { if (w != null) {
mvcc.completeMemstoreInsert(w); mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
w = null; w = null;
} }
@ -2636,9 +2645,11 @@ public class HRegion implements HeapSize { // , Writable{
// if the wal sync was unsuccessful, remove keys from memstore // if the wal sync was unsuccessful, remove keys from memstore
if (doRollBackMemstore) { if (doRollBackMemstore) {
rollbackMemstore(batchOp, familyMaps, firstIndex, lastIndexExclusive); rollbackMemstore(memstoreCells);
}
if (w != null) {
mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
} }
if (w != null) mvcc.completeMemstoreInsert(w);
if (locked) { if (locked) {
this.updatesLock.readLock().unlock(); this.updatesLock.readLock().unlock();
@ -2727,7 +2738,7 @@ public class HRegion implements HeapSize { // , Writable{
// Lock row - note that doBatchMutate will relock this row if called // Lock row - note that doBatchMutate will relock this row if called
RowLock rowLock = getRowLock(get.getRow()); RowLock rowLock = getRowLock(get.getRow());
// wait for all previous transactions to complete (with lock held) // wait for all previous transactions to complete (with lock held)
mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert()); mvcc.waitForPreviousTransactionsComplete();
try { try {
if (this.getCoprocessorHost() != null) { if (this.getCoprocessorHost() != null) {
Boolean processed = null; Boolean processed = null;
@ -2903,34 +2914,25 @@ public class HRegion implements HeapSize { // , Writable{
* @param familyMap Map of kvs per family * @param familyMap Map of kvs per family
* @param localizedWriteEntry The WriteEntry of the MVCC for this transaction. * @param localizedWriteEntry The WriteEntry of the MVCC for this transaction.
* If null, then this method internally creates a mvcc transaction. * If null, then this method internally creates a mvcc transaction.
* @param output newly added KVs into memstore
* @return the additional memory usage of the memstore caused by the * @return the additional memory usage of the memstore caused by the
* new entries. * new entries.
*/ */
private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap, private long applyFamilyMapToMemstore(Map<byte[], List<Cell>> familyMap,
MultiVersionConsistencyControl.WriteEntry localizedWriteEntry) { long mvccNum, List<KeyValue> memstoreCells) {
long size = 0; long size = 0;
boolean freemvcc = false;
try { for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
if (localizedWriteEntry == null) { byte[] family = e.getKey();
localizedWriteEntry = mvcc.beginMemstoreInsert(); List<Cell> cells = e.getValue();
freemvcc = true;
}
for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) { Store store = getStore(family);
byte[] family = e.getKey(); for (Cell cell: cells) {
List<Cell> cells = e.getValue(); KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
kv.setMvccVersion(mvccNum);
Store store = getStore(family); Pair<Long, Cell> ret = store.add(kv);
for (Cell cell: cells) { size += ret.getFirst();
KeyValue kv = KeyValueUtil.ensureKeyValue(cell); memstoreCells.add(KeyValueUtil.ensureKeyValue(ret.getSecond()));
kv.setMvccVersion(localizedWriteEntry.getWriteNumber());
size += store.add(kv);
}
}
} finally {
if (freemvcc) {
mvcc.completeMemstoreInsert(localizedWriteEntry);
} }
} }
@ -2942,35 +2944,16 @@ public class HRegion implements HeapSize { // , Writable{
* called when a Put/Delete has updated memstore but subsequently fails to update * called when a Put/Delete has updated memstore but subsequently fails to update
* the wal. This method is then invoked to rollback the memstore. * the wal. This method is then invoked to rollback the memstore.
*/ */
private void rollbackMemstore(BatchOperationInProgress<?> batchOp, private void rollbackMemstore(List<KeyValue> memstoreCells) {
Map<byte[], List<Cell>>[] familyMaps,
int start, int end) {
int kvsRolledback = 0; int kvsRolledback = 0;
for (int i = start; i < end; i++) {
// skip over request that never succeeded in the first place. for (KeyValue kv : memstoreCells) {
if (batchOp.retCodeDetails[i].getOperationStatusCode() byte[] family = kv.getFamily();
!= OperationStatusCode.SUCCESS) { Store store = getStore(family);
continue; store.rollback(kv);
} kvsRolledback++;
// Rollback all the kvs for this row.
Map<byte[], List<Cell>> familyMap = familyMaps[i];
for (Map.Entry<byte[], List<Cell>> e : familyMap.entrySet()) {
byte[] family = e.getKey();
List<Cell> cells = e.getValue();
// Remove those keys from the memstore that matches our
// key's (row, cf, cq, timestamp, memstoreTS). The interesting part is
// that even the memstoreTS has to match for keys that will be rolled-back.
Store store = getStore(family);
for (Cell cell: cells) {
store.rollback(KeyValueUtil.ensureKeyValue(cell));
kvsRolledback++;
}
}
} }
LOG.debug("rollbackMemstore rolled back " + kvsRolledback + LOG.debug("rollbackMemstore rolled back " + kvsRolledback);
" keyvalues from start:" + start + " to end:" + end);
} }
/** /**
@ -3378,7 +3361,7 @@ public class HRegion implements HeapSize { // , Writable{
* @return True if we should flush. * @return True if we should flush.
*/ */
protected boolean restoreEdit(final Store s, final KeyValue kv) { protected boolean restoreEdit(final Store s, final KeyValue kv) {
long kvSize = s.add(kv); long kvSize = s.add(kv).getFirst();
if (this.rsAccounting != null) { if (this.rsAccounting != null) {
rsAccounting.addAndGetRegionReplayEditsSize(this.getRegionName(), kvSize); rsAccounting.addAndGetRegionReplayEditsSize(this.getRegionName(), kvSize);
} }
@ -4883,7 +4866,10 @@ public class HRegion implements HeapSize { // , Writable{
List<RowLock> acquiredRowLocks; List<RowLock> acquiredRowLocks;
long addedSize = 0; long addedSize = 0;
List<KeyValue> mutations = new ArrayList<KeyValue>(); List<KeyValue> mutations = new ArrayList<KeyValue>();
List<KeyValue> memstoreCells = new ArrayList<KeyValue>();
Collection<byte[]> rowsToLock = processor.getRowsToLock(); Collection<byte[]> rowsToLock = processor.getRowsToLock();
long mvccNum = 0;
HLogKey walKey = null;
try { try {
// 2. Acquire the row lock(s) // 2. Acquire the row lock(s)
acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size()); acquiredRowLocks = new ArrayList<RowLock>(rowsToLock.size());
@ -4894,6 +4880,7 @@ public class HRegion implements HeapSize { // , Writable{
// 3. Region lock // 3. Region lock
lock(this.updatesLock.readLock(), acquiredRowLocks.size()); lock(this.updatesLock.readLock(), acquiredRowLocks.size());
locked = true; locked = true;
mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
long now = EnvironmentEdgeManager.currentTimeMillis(); long now = EnvironmentEdgeManager.currentTimeMillis();
try { try {
@ -4904,27 +4891,35 @@ public class HRegion implements HeapSize { // , Writable{
if (!mutations.isEmpty()) { if (!mutations.isEmpty()) {
// 5. Get a mvcc write number // 5. Get a mvcc write number
writeEntry = mvcc.beginMemstoreInsert(); writeEntry = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
// 6. Apply to memstore // 6. Apply to memstore
for (KeyValue kv : mutations) { for (KeyValue kv : mutations) {
kv.setMvccVersion(writeEntry.getWriteNumber()); kv.setMvccVersion(mvccNum);
Store store = getStore(kv); Store store = getStore(kv);
if (store == null) { if (store == null) {
checkFamily(CellUtil.cloneFamily(kv)); checkFamily(CellUtil.cloneFamily(kv));
// unreachable // unreachable
} }
addedSize += store.add(kv); Pair<Long, Cell> ret = store.add(kv);
addedSize += ret.getFirst();
memstoreCells.add(KeyValueUtil.ensureKeyValue(ret.getSecond()));
} }
long txid = 0; long txid = 0;
// 7. Append no sync // 7. Append no sync
if (!walEdit.isEmpty()) { if (!walEdit.isEmpty()) {
HLogKey key = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(), walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), now, processor.getClusterIds(), nonceGroup, this.htableDescriptor.getTableName(), HLog.NO_SEQUENCE_ID, now,
nonce); processor.getClusterIds(), nonceGroup, nonce);
txid = this.log.appendNoSync(this.htableDescriptor, this.getRegionInfo(), txid = this.log.appendNoSync(this.htableDescriptor, this.getRegionInfo(),
key, walEdit, getSequenceId(), true); walKey, walEdit, getSequenceId(), true, memstoreCells);
} }
if(walKey == null){
// since we use log sequence Id as mvcc, for SKIP_WAL changes we need a "faked" WALEdit
// to get a sequence id assigned which is done by FSWALEntry#stampRegionSequenceId
walKey = this.appendNoSyncNoAppend(this.log, memstoreCells);
}
// 8. Release region lock // 8. Release region lock
if (locked) { if (locked) {
this.updatesLock.readLock().unlock(); this.updatesLock.readLock().unlock();
@ -4951,7 +4946,7 @@ public class HRegion implements HeapSize { // , Writable{
} }
// 11. Roll mvcc forward // 11. Roll mvcc forward
if (writeEntry != null) { if (writeEntry != null) {
mvcc.completeMemstoreInsert(writeEntry); mvcc.completeMemstoreInsertWithSeqNum(writeEntry, walKey);
} }
if (locked) { if (locked) {
this.updatesLock.readLock().unlock(); this.updatesLock.readLock().unlock();
@ -5055,8 +5050,12 @@ public class HRegion implements HeapSize { // , Writable{
// Lock row // Lock row
startRegionOperation(Operation.APPEND); startRegionOperation(Operation.APPEND);
this.writeRequestsCount.increment(); this.writeRequestsCount.increment();
long mvccNum = 0;
WriteEntry w = null; WriteEntry w = null;
RowLock rowLock; HLogKey walKey = null;
RowLock rowLock = null;
List<KeyValue> memstoreCells = new ArrayList<KeyValue>();
boolean doRollBackMemstore = false;
try { try {
rowLock = getRowLock(row); rowLock = getRowLock(row);
try { try {
@ -5064,7 +5063,7 @@ public class HRegion implements HeapSize { // , Writable{
try { try {
// wait for all prior MVCC transactions to finish - while we hold the row lock // wait for all prior MVCC transactions to finish - while we hold the row lock
// (so that we are guaranteed to see the latest state) // (so that we are guaranteed to see the latest state)
mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert()); mvcc.waitForPreviousTransactionsComplete();
if (this.coprocessorHost != null) { if (this.coprocessorHost != null) {
Result r = this.coprocessorHost.preAppendAfterRowLock(append); Result r = this.coprocessorHost.preAppendAfterRowLock(append);
if(r!= null) { if(r!= null) {
@ -5072,7 +5071,8 @@ public class HRegion implements HeapSize { // , Writable{
} }
} }
// now start my own transaction // now start my own transaction
w = mvcc.beginMemstoreInsert(); mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
w = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
long now = EnvironmentEdgeManager.currentTimeMillis(); long now = EnvironmentEdgeManager.currentTimeMillis();
// Process each family // Process each family
for (Map.Entry<byte[], List<Cell>> family : append.getFamilyCellMap().entrySet()) { for (Map.Entry<byte[], List<Cell>> family : append.getFamilyCellMap().entrySet()) {
@ -5140,7 +5140,7 @@ public class HRegion implements HeapSize { // , Writable{
// so only need to update the timestamp to 'now' // so only need to update the timestamp to 'now'
newKV.updateLatestStamp(Bytes.toBytes(now)); newKV.updateLatestStamp(Bytes.toBytes(now));
} }
newKV.setMvccVersion(w.getWriteNumber()); newKV.setMvccVersion(mvccNum);
// Give coprocessors a chance to update the new cell // Give coprocessors a chance to update the new cell
if (coprocessorHost != null) { if (coprocessorHost != null) {
newKV = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL( newKV = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL(
@ -5161,34 +5161,43 @@ public class HRegion implements HeapSize { // , Writable{
tempMemstore.put(store, kvs); tempMemstore.put(store, kvs);
} }
// Actually write to WAL now
if (writeToWAL) {
// Using default cluster id, as this can only happen in the originating
// cluster. A slave cluster receives the final value (not the delta)
// as a Put.
HLogKey key = new HLogKey(getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), nonceGroup, nonce);
txid = this.log.appendNoSync(this.htableDescriptor, getRegionInfo(), key, walEdits,
this.sequenceId, true);
} else {
recordMutationWithoutWal(append.getFamilyCellMap());
}
//Actually write to Memstore now //Actually write to Memstore now
for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) { for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
Store store = entry.getKey(); Store store = entry.getKey();
if (store.getFamily().getMaxVersions() == 1) { if (store.getFamily().getMaxVersions() == 1) {
// upsert if VERSIONS for this CF == 1 // upsert if VERSIONS for this CF == 1
size += store.upsert(entry.getValue(), getSmallestReadPoint()); size += store.upsert(entry.getValue(), getSmallestReadPoint());
memstoreCells.addAll(KeyValueUtil.ensureKeyValues(entry.getValue()));
} else { } else {
// otherwise keep older versions around // otherwise keep older versions around
for (Cell cell: entry.getValue()) { for (Cell cell: entry.getValue()) {
KeyValue kv = KeyValueUtil.ensureKeyValue(cell); KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
size += store.add(kv); Pair<Long, Cell> ret = store.add(kv);
size += ret.getFirst();
memstoreCells.add(KeyValueUtil.ensureKeyValue(ret.getSecond()));
doRollBackMemstore = true;
} }
} }
allKVs.addAll(entry.getValue()); allKVs.addAll(entry.getValue());
} }
// Actually write to WAL now
if (writeToWAL) {
// Using default cluster id, as this can only happen in the originating
// cluster. A slave cluster receives the final value (not the delta)
// as a Put.
walKey = new HLogKey(getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), HLog.NO_SEQUENCE_ID, nonceGroup, nonce);
txid = this.log.appendNoSync(this.htableDescriptor, getRegionInfo(), walKey, walEdits,
this.sequenceId, true, memstoreCells);
} else {
recordMutationWithoutWal(append.getFamilyCellMap());
}
if(walKey == null){
// Append a faked WALEdit in order for SKIP_WAL updates to get mvcc assigned
walKey = this.appendNoSyncNoAppend(this.log, memstoreCells);
}
size = this.addAndGetGlobalMemstoreSize(size); size = this.addAndGetGlobalMemstoreSize(size);
flush = isFlushSize(size); flush = isFlushSize(size);
} finally { } finally {
@ -5196,14 +5205,23 @@ public class HRegion implements HeapSize { // , Writable{
} }
} finally { } finally {
rowLock.release(); rowLock.release();
rowLock = null;
} }
if (writeToWAL) { // sync the transaction log outside the rowlock
// sync the transaction log outside the rowlock if(txid != 0){
syncOrDefer(txid, durability); syncOrDefer(txid, durability);
} }
doRollBackMemstore = false;
} finally { } finally {
if (rowLock != null) {
rowLock.release();
}
// if the wal sync was unsuccessful, remove keys from memstore
if (doRollBackMemstore) {
rollbackMemstore(memstoreCells);
}
if (w != null) { if (w != null) {
mvcc.completeMemstoreInsert(w); mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
} }
closeRegionOperation(Operation.APPEND); closeRegionOperation(Operation.APPEND);
} }
@ -5250,15 +5268,20 @@ public class HRegion implements HeapSize { // , Writable{
// Lock row // Lock row
startRegionOperation(Operation.INCREMENT); startRegionOperation(Operation.INCREMENT);
this.writeRequestsCount.increment(); this.writeRequestsCount.increment();
RowLock rowLock = null;
WriteEntry w = null; WriteEntry w = null;
HLogKey walKey = null;
long mvccNum = 0;
List<KeyValue> memstoreCells = new ArrayList<KeyValue>();
boolean doRollBackMemstore = false;
try { try {
RowLock rowLock = getRowLock(row); rowLock = getRowLock(row);
try { try {
lock(this.updatesLock.readLock()); lock(this.updatesLock.readLock());
try { try {
// wait for all prior MVCC transactions to finish - while we hold the row lock // wait for all prior MVCC transactions to finish - while we hold the row lock
// (so that we are guaranteed to see the latest state) // (so that we are guaranteed to see the latest state)
mvcc.completeMemstoreInsert(mvcc.beginMemstoreInsert()); mvcc.waitForPreviousTransactionsComplete();
if (this.coprocessorHost != null) { if (this.coprocessorHost != null) {
Result r = this.coprocessorHost.preIncrementAfterRowLock(increment); Result r = this.coprocessorHost.preIncrementAfterRowLock(increment);
if (r != null) { if (r != null) {
@ -5266,7 +5289,8 @@ public class HRegion implements HeapSize { // , Writable{
} }
} }
// now start my own transaction // now start my own transaction
w = mvcc.beginMemstoreInsert(); mvccNum = MultiVersionConsistencyControl.getPreAssignedWriteNumber(this.sequenceId);
w = mvcc.beginMemstoreInsertWithSeqNum(mvccNum);
long now = EnvironmentEdgeManager.currentTimeMillis(); long now = EnvironmentEdgeManager.currentTimeMillis();
// Process each family // Process each family
for (Map.Entry<byte [], List<Cell>> family: for (Map.Entry<byte [], List<Cell>> family:
@ -5330,7 +5354,7 @@ public class HRegion implements HeapSize { // , Writable{
System.arraycopy(kv.getTagsArray(), kv.getTagsOffset(), newKV.getTagsArray(), System.arraycopy(kv.getTagsArray(), kv.getTagsOffset(), newKV.getTagsArray(),
newKV.getTagsOffset() + oldCellTagsLen, incCellTagsLen); newKV.getTagsOffset() + oldCellTagsLen, incCellTagsLen);
} }
newKV.setMvccVersion(w.getWriteNumber()); newKV.setMvccVersion(mvccNum);
// Give coprocessors a chance to update the new cell // Give coprocessors a chance to update the new cell
if (coprocessorHost != null) { if (coprocessorHost != null) {
newKV = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL( newKV = KeyValueUtil.ensureKeyValue(coprocessorHost.postMutationBeforeWAL(
@ -5357,20 +5381,6 @@ public class HRegion implements HeapSize { // , Writable{
} }
} }
// Actually write to WAL now
if (walEdits != null && !walEdits.isEmpty()) {
if (writeToWAL) {
// Using default cluster id, as this can only happen in the originating
// cluster. A slave cluster receives the final value (not the delta)
// as a Put.
HLogKey key = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), nonceGroup, nonce);
txid = this.log.appendNoSync(this.htableDescriptor, this.getRegionInfo(),
key, walEdits, getSequenceId(), true);
} else {
recordMutationWithoutWal(increment.getFamilyCellMap());
}
}
//Actually write to Memstore now //Actually write to Memstore now
if (!tempMemstore.isEmpty()) { if (!tempMemstore.isEmpty()) {
for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) { for (Map.Entry<Store, List<Cell>> entry : tempMemstore.entrySet()) {
@ -5378,30 +5388,62 @@ public class HRegion implements HeapSize { // , Writable{
if (store.getFamily().getMaxVersions() == 1) { if (store.getFamily().getMaxVersions() == 1) {
// upsert if VERSIONS for this CF == 1 // upsert if VERSIONS for this CF == 1
size += store.upsert(entry.getValue(), getSmallestReadPoint()); size += store.upsert(entry.getValue(), getSmallestReadPoint());
memstoreCells.addAll(KeyValueUtil.ensureKeyValues(entry.getValue()));
} else { } else {
// otherwise keep older versions around // otherwise keep older versions around
for (Cell cell : entry.getValue()) { for (Cell cell : entry.getValue()) {
KeyValue kv = KeyValueUtil.ensureKeyValue(cell); KeyValue kv = KeyValueUtil.ensureKeyValue(cell);
size += store.add(kv); Pair<Long, Cell> ret = store.add(kv);
size += ret.getFirst();
memstoreCells.add(KeyValueUtil.ensureKeyValue(ret.getSecond()));
doRollBackMemstore = true;
} }
} }
} }
size = this.addAndGetGlobalMemstoreSize(size); size = this.addAndGetGlobalMemstoreSize(size);
flush = isFlushSize(size); flush = isFlushSize(size);
} }
// Actually write to WAL now
if (walEdits != null && !walEdits.isEmpty()) {
if (writeToWAL) {
// Using default cluster id, as this can only happen in the originating
// cluster. A slave cluster receives the final value (not the delta)
// as a Put.
walKey = new HLogKey(this.getRegionInfo().getEncodedNameAsBytes(),
this.htableDescriptor.getTableName(), HLog.NO_SEQUENCE_ID, nonceGroup, nonce);
txid = this.log.appendNoSync(this.htableDescriptor, this.getRegionInfo(),
walKey, walEdits, getSequenceId(), true, memstoreCells);
} else {
recordMutationWithoutWal(increment.getFamilyCellMap());
}
}
if(walKey == null){
// Append a faked WALEdit in order for SKIP_WAL updates to get mvccNum assigned
walKey = this.appendNoSyncNoAppend(this.log, memstoreCells);
}
} finally { } finally {
this.updatesLock.readLock().unlock(); this.updatesLock.readLock().unlock();
} }
} finally { } finally {
rowLock.release(); rowLock.release();
rowLock = null;
} }
if (writeToWAL && (walEdits != null) && !walEdits.isEmpty()) { // sync the transaction log outside the rowlock
// sync the transaction log outside the rowlock if(txid != 0){
syncOrDefer(txid, durability); syncOrDefer(txid, durability);
} }
doRollBackMemstore = false;
} finally { } finally {
if (rowLock != null) {
rowLock.release();
}
// if the wal sync was unsuccessful, remove keys from memstore
if (doRollBackMemstore) {
rollbackMemstore(memstoreCells);
}
if (w != null) { if (w != null) {
mvcc.completeMemstoreInsert(w); mvcc.completeMemstoreInsertWithSeqNum(w, walKey);
} }
closeRegionOperation(Operation.INCREMENT); closeRegionOperation(Operation.INCREMENT);
if (this.metricsRegion != null) { if (this.metricsRegion != null) {
@ -6130,4 +6172,23 @@ public class HRegion implements HeapSize { // , Writable{
} }
} }
} }
/**
* Append a faked WALEdit in order to get a long sequence number and log syncer will just ignore
* the WALEdit append later.
* @param wal
* @param cells list of KeyValues inserted into memstore. Those KeyValues are passed in order to
* be updated with right mvcc values(their log sequence nu
* @return
* @throws IOException
*/
private HLogKey appendNoSyncNoAppend(final HLog wal, List<KeyValue> cells) throws IOException {
HLogKey key = new HLogKey(getRegionInfo().getEncodedNameAsBytes(), getRegionInfo().getTable(),
HLog.NO_SEQUENCE_ID, 0, null, HConstants.NO_NONCE, HConstants.NO_NONCE);
// Call append but with an empty WALEdit. The returned seqeunce id will not be associated
// with any edit and we can be sure it went in after all outstanding appends.
wal.appendNoSync(getTableDesc(), getRegionInfo(), key,
WALEdit.EMPTY_WALEDIT, this.sequenceId, false, cells);
return key;
}
} }

View File

@ -84,6 +84,7 @@ import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.ChecksumType; import org.apache.hadoop.hbase.util.ChecksumType;
import org.apache.hadoop.hbase.util.ClassSize; import org.apache.hadoop.hbase.util.ClassSize;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.hbase.util.Pair;
import org.apache.hadoop.hbase.util.ReflectionUtils; import org.apache.hadoop.hbase.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils; import org.apache.hadoop.util.StringUtils;
@ -564,10 +565,10 @@ public class HStore implements Store {
} }
@Override @Override
public long add(final KeyValue kv) { public Pair<Long, Cell> add(final KeyValue kv) {
lock.readLock().lock(); lock.readLock().lock();
try { try {
return this.memstore.add(kv); return this.memstore.add(kv);
} finally { } finally {
lock.readLock().unlock(); lock.readLock().unlock();
} }

View File

@ -22,6 +22,7 @@ import java.util.List;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hbase.Cell; import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.io.HeapSize; import org.apache.hadoop.hbase.io.HeapSize;
import org.apache.hadoop.hbase.util.Pair;
/** /**
* The MemStore holds in-memory modifications to the Store. Modifications are {@link Cell}s. * The MemStore holds in-memory modifications to the Store. Modifications are {@link Cell}s.
@ -61,9 +62,10 @@ public interface MemStore extends HeapSize {
/** /**
* Write an update * Write an update
* @param cell * @param cell
* @return approximate size of the passed key and value. * @return approximate size of the passed KV and the newly added KV which maybe different from the
* passed in KV.
*/ */
long add(final Cell cell); Pair<Long, Cell> add(final Cell cell);
/** /**
* @return Oldest timestamp of all the Cells in the MemStore * @return Oldest timestamp of all the Cells in the MemStore

View File

@ -18,7 +18,9 @@
*/ */
package org.apache.hadoop.hbase.regionserver; package org.apache.hadoop.hbase.regionserver;
import java.io.IOException;
import java.util.LinkedList; import java.util.LinkedList;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
@ -32,20 +34,18 @@ import org.apache.hadoop.hbase.util.ClassSize;
*/ */
@InterfaceAudience.Private @InterfaceAudience.Private
public class MultiVersionConsistencyControl { public class MultiVersionConsistencyControl {
private volatile long memstoreRead = 0; private static final long NO_WRITE_NUMBER = 0;
private volatile long memstoreWrite = 0; private volatile long memstoreRead = 0;
private final Object readWaiters = new Object(); private final Object readWaiters = new Object();
// This is the pending queue of writes. // This is the pending queue of writes.
private final LinkedList<WriteEntry> writeQueue = private final LinkedList<WriteEntry> writeQueue =
new LinkedList<WriteEntry>(); new LinkedList<WriteEntry>();
/** /**
* Default constructor. Initializes the memstoreRead/Write points to 0. * Default constructor. Initializes the memstoreRead/Write points to 0.
*/ */
public MultiVersionConsistencyControl() { public MultiVersionConsistencyControl() {
this.memstoreRead = this.memstoreWrite = 0;
} }
/** /**
@ -54,37 +54,86 @@ public class MultiVersionConsistencyControl {
*/ */
public void initialize(long startPoint) { public void initialize(long startPoint) {
synchronized (writeQueue) { synchronized (writeQueue) {
if (this.memstoreWrite != this.memstoreRead) { writeQueue.clear();
throw new RuntimeException("Already used this mvcc. Too late to initialize"); memstoreRead = startPoint;
}
this.memstoreRead = this.memstoreWrite = startPoint;
} }
} }
/** /**
* Generate and return a {@link WriteEntry} with a new write number. *
* To complete the WriteEntry and wait for it to be visible, * @param initVal The value we used initially and expected it'll be reset later
* call {@link #completeMemstoreInsert(WriteEntry)}. * @return
*/ */
public WriteEntry beginMemstoreInsert() { WriteEntry beginMemstoreInsert() {
return beginMemstoreInsertWithSeqNum(NO_WRITE_NUMBER);
}
/**
* Get a mvcc write number before an actual one(its log sequence Id) being assigned
* @param sequenceId
* @return long a faked write number which is bigger enough not to be seen by others before a real
* one is assigned
*/
public static long getPreAssignedWriteNumber(AtomicLong sequenceId) {
// the 1 billion is just an arbitrary big number to guard no scanner will reach it before
// current MVCC completes. Theoretically the bump only needs to be 2 * the number of handlers
// because each handler could increment sequence num twice and max concurrent in-flight
// transactions is the number of RPC handlers.
// we can't use Long.MAX_VALUE because we still want to maintain the ordering when multiple
// changes touch same row key
// If for any reason, the bumped value isn't reset due to failure situations, we'll reset
// curSeqNum to NO_WRITE_NUMBER in order NOT to advance memstore read point at all
return sequenceId.incrementAndGet() + 1000000000;
}
/**
* This function starts a MVCC transaction with current region's log change sequence number. Since
* we set change sequence number when flushing current change to WAL(late binding), the flush
* order may differ from the order to start a MVCC transaction. For example, a change begins a
* MVCC firstly may complete later than a change which starts MVCC at a later time. Therefore, we
* add a safe bumper to the passed in sequence number to start a MVCC so that no other concurrent
* transactions will reuse the number till current MVCC completes(success or fail). The "faked"
* big number is safe because we only need it to prevent current change being seen and the number
* will be reset to real sequence number(set in log sync) right before we complete a MVCC in order
* for MVCC to align with flush sequence.
* @param curSeqNum
* @return WriteEntry a WriteEntry instance with the passed in curSeqNum
*/
public WriteEntry beginMemstoreInsertWithSeqNum(long curSeqNum) {
WriteEntry e = new WriteEntry(curSeqNum);
synchronized (writeQueue) { synchronized (writeQueue) {
long nextWriteNumber = ++memstoreWrite;
WriteEntry e = new WriteEntry(nextWriteNumber);
writeQueue.add(e); writeQueue.add(e);
return e; return e;
} }
} }
/** /**
* Complete a {@link WriteEntry} that was created by {@link #beginMemstoreInsert()}. * Complete a {@link WriteEntry} that was created by
* * {@link #beginMemstoreInsertWithSeqNum(long)}. At the end of this call, the global read
* At the end of this call, the global read point is at least as large as the write point * point is at least as large as the write point of the passed in WriteEntry. Thus, the write is
* of the passed in WriteEntry. Thus, the write is visible to MVCC readers. * visible to MVCC readers.
* @throws IOException
*/
public void completeMemstoreInsertWithSeqNum(WriteEntry e, SequenceNumber seqNum)
throws IOException {
if(e == null) return;
if (seqNum != null) {
e.setWriteNumber(seqNum.getSequenceNumber());
} else {
// set the value to NO_WRITE_NUMBER in order NOT to advance memstore readpoint inside
// function beginMemstoreInsertWithSeqNum in case of failures
e.setWriteNumber(NO_WRITE_NUMBER);
}
waitForPreviousTransactionsComplete(e);
}
/**
* Complete a {@link WriteEntry} that was created by {@link #beginMemstoreInsert()}. At the
* end of this call, the global read point is at least as large as the write point of the passed
* in WriteEntry. Thus, the write is visible to MVCC readers.
*/ */
public void completeMemstoreInsert(WriteEntry e) { public void completeMemstoreInsert(WriteEntry e) {
advanceMemstore(e); waitForPreviousTransactionsComplete(e);
waitForRead(e);
} }
/** /**
@ -99,75 +148,94 @@ public class MultiVersionConsistencyControl {
* @return true if e is visible to MVCC readers (that is, readpoint >= e.writeNumber) * @return true if e is visible to MVCC readers (that is, readpoint >= e.writeNumber)
*/ */
boolean advanceMemstore(WriteEntry e) { boolean advanceMemstore(WriteEntry e) {
long nextReadValue = -1;
synchronized (writeQueue) { synchronized (writeQueue) {
e.markCompleted(); e.markCompleted();
long nextReadValue = -1;
boolean ranOnce=false;
while (!writeQueue.isEmpty()) { while (!writeQueue.isEmpty()) {
ranOnce=true;
WriteEntry queueFirst = writeQueue.getFirst(); WriteEntry queueFirst = writeQueue.getFirst();
if (nextReadValue > 0) {
if (nextReadValue+1 != queueFirst.getWriteNumber()) {
throw new RuntimeException("invariant in completeMemstoreInsert violated, prev: "
+ nextReadValue + " next: " + queueFirst.getWriteNumber());
}
}
if (queueFirst.isCompleted()) { if (queueFirst.isCompleted()) {
nextReadValue = queueFirst.getWriteNumber(); // Using Max because Edit complete in WAL sync order not arriving order
nextReadValue = Math.max(nextReadValue, queueFirst.getWriteNumber());
writeQueue.removeFirst(); writeQueue.removeFirst();
} else { } else {
break; break;
} }
} }
if (!ranOnce) { if (nextReadValue > memstoreRead) {
throw new RuntimeException("never was a first"); memstoreRead = nextReadValue;
} }
if (nextReadValue > 0) { // notify waiters on writeQueue before return
synchronized (readWaiters) { writeQueue.notifyAll();
memstoreRead = nextReadValue;
readWaiters.notifyAll();
}
}
if (memstoreRead >= e.getWriteNumber()) {
return true;
}
return false;
} }
if (nextReadValue > 0) {
synchronized (readWaiters) {
readWaiters.notifyAll();
}
}
if (memstoreRead >= e.getWriteNumber()) {
return true;
}
return false;
} }
/** /**
* Wait for the global readPoint to advance upto * Wait for all previous MVCC transactions complete
* the specified transaction number.
*/ */
public void waitForRead(WriteEntry e) { public void waitForPreviousTransactionsComplete() {
WriteEntry w = beginMemstoreInsert();
waitForPreviousTransactionsComplete(w);
}
public void waitForPreviousTransactionsComplete(WriteEntry waitedEntry) {
boolean interrupted = false; boolean interrupted = false;
synchronized (readWaiters) { WriteEntry w = waitedEntry;
while (memstoreRead < e.getWriteNumber()) {
try { try {
readWaiters.wait(0); WriteEntry firstEntry = null;
} catch (InterruptedException ie) { do {
// We were interrupted... finish the loop -- i.e. cleanup --and then synchronized (writeQueue) {
// on our way out, reset the interrupt flag. // writeQueue won't be empty at this point, the following is just a safety check
interrupted = true; if (writeQueue.isEmpty()) {
break;
}
firstEntry = writeQueue.getFirst();
if (firstEntry == w) {
// all previous in-flight transactions are done
break;
}
try {
writeQueue.wait(0);
} catch (InterruptedException ie) {
// We were interrupted... finish the loop -- i.e. cleanup --and then
// on our way out, reset the interrupt flag.
interrupted = true;
break;
}
} }
} while (firstEntry != null);
} finally {
if (w != null) {
advanceMemstore(w);
} }
} }
if (interrupted) Thread.currentThread().interrupt(); if (interrupted) {
Thread.currentThread().interrupt();
}
} }
public long memstoreReadPoint() { public long memstoreReadPoint() {
return memstoreRead; return memstoreRead;
} }
public static class WriteEntry { public static class WriteEntry {
private long writeNumber; private long writeNumber;
private boolean completed = false; private boolean completed = false;
WriteEntry(long writeNumber) { WriteEntry(long writeNumber) {
this.writeNumber = writeNumber; this.writeNumber = writeNumber;
} }
@ -180,6 +248,9 @@ public class MultiVersionConsistencyControl {
long getWriteNumber() { long getWriteNumber() {
return this.writeNumber; return this.writeNumber;
} }
void setWriteNumber(long val){
this.writeNumber = val;
}
} }
public static final long FIXED_SIZE = ClassSize.align( public static final long FIXED_SIZE = ClassSize.align(

View File

@ -0,0 +1,31 @@
/**
*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.hbase.regionserver;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
/**
* Interface which abstracts implementations on log sequence number assignment
*/
@InterfaceAudience.Private
public interface SequenceNumber {
public long getSequenceNumber() throws IOException;
}

View File

@ -40,6 +40,7 @@ import org.apache.hadoop.hbase.protobuf.generated.WALProtos.CompactionDescriptor
import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext; import org.apache.hadoop.hbase.regionserver.compactions.CompactionContext;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionProgress; import org.apache.hadoop.hbase.regionserver.compactions.CompactionProgress;
import org.apache.hadoop.hbase.regionserver.compactions.CompactionRequest; import org.apache.hadoop.hbase.regionserver.compactions.CompactionRequest;
import org.apache.hadoop.hbase.util.Pair;
/** /**
* Interface for objects that hold a column family in a Region. Its a memstore and a set of zero or * Interface for objects that hold a column family in a Region. Its a memstore and a set of zero or
@ -122,9 +123,9 @@ public interface Store extends HeapSize, StoreConfigInformation {
/** /**
* Adds a value to the memstore * Adds a value to the memstore
* @param kv * @param kv
* @return memstore size delta * @return memstore size delta & newly added KV which maybe different than the passed in KV
*/ */
long add(KeyValue kv); Pair<Long, Cell> add(KeyValue kv);
/** /**
* When was the last edit done in the memstore * When was the last edit done in the memstore

View File

@ -121,12 +121,6 @@ abstract class StoreFlusher {
// set its memstoreTS to 0. This will help us save space when writing to // set its memstoreTS to 0. This will help us save space when writing to
// disk. // disk.
KeyValue kv = KeyValueUtil.ensureKeyValue(c); KeyValue kv = KeyValueUtil.ensureKeyValue(c);
if (kv.getMvccVersion() <= smallestReadPoint) {
// let us not change the original KV. It could be in the memstore
// changing its memstoreTS could affect other threads/scanners.
kv = kv.shallowCopy();
kv.setMvccVersion(0);
}
sink.append(kv); sink.append(kv);
} }
kvs.clear(); kvs.clear();

View File

@ -1064,13 +1064,26 @@ class FSHLog implements HLog, Syncable {
} }
} }
/**
* @param now
* @param encodedRegionName Encoded name of the region as returned by
* <code>HRegionInfo#getEncodedNameAsBytes()</code>.
* @param tableName
* @param clusterIds that have consumed the change
* @return New log key.
*/
protected HLogKey makeKey(byte[] encodedRegionName, TableName tableName, long seqnum,
long now, List<UUID> clusterIds, long nonceGroup, long nonce) {
return new HLogKey(encodedRegionName, tableName, seqnum, now, clusterIds, nonceGroup, nonce);
}
@Override @Override
@VisibleForTesting @VisibleForTesting
public void append(HRegionInfo info, TableName tableName, WALEdit edits, public void append(HRegionInfo info, TableName tableName, WALEdit edits,
final long now, HTableDescriptor htd, AtomicLong sequenceId) final long now, HTableDescriptor htd, AtomicLong sequenceId)
throws IOException { throws IOException {
HLogKey logKey = new HLogKey(info.getEncodedNameAsBytes(), tableName, now); HLogKey logKey = new HLogKey(info.getEncodedNameAsBytes(), tableName, now);
append(htd, info, logKey, edits, sequenceId, true, true); append(htd, info, logKey, edits, sequenceId, true, true, null);
} }
@Override @Override
@ -1079,14 +1092,15 @@ class FSHLog implements HLog, Syncable {
boolean inMemstore, long nonceGroup, long nonce) throws IOException { boolean inMemstore, long nonceGroup, long nonce) throws IOException {
HLogKey logKey = HLogKey logKey =
new HLogKey(info.getEncodedNameAsBytes(), tableName, now, clusterIds, nonceGroup, nonce); new HLogKey(info.getEncodedNameAsBytes(), tableName, now, clusterIds, nonceGroup, nonce);
return append(htd, info, logKey, edits, sequenceId, false, inMemstore); return append(htd, info, logKey, edits, sequenceId, false, inMemstore, null);
} }
@Override @Override
public long appendNoSync(final HTableDescriptor htd, final HRegionInfo info, final HLogKey key, public long appendNoSync(final HTableDescriptor htd, final HRegionInfo info, final HLogKey key,
final WALEdit edits, final AtomicLong sequenceId, final boolean inMemstore) final WALEdit edits, final AtomicLong sequenceId, final boolean inMemstore,
final List<KeyValue> memstoreKVs)
throws IOException { throws IOException {
return append(htd, info, key, edits, sequenceId, false, inMemstore); return append(htd, info, key, edits, sequenceId, false, inMemstore, memstoreKVs);
} }
/** /**
@ -1101,19 +1115,22 @@ class FSHLog implements HLog, Syncable {
* @param sync shall we sync after we call the append? * @param sync shall we sync after we call the append?
* @param inMemstore * @param inMemstore
* @param sequenceId The region sequence id reference. * @param sequenceId The region sequence id reference.
* @param memstoreKVs
* @return txid of this transaction or if nothing to do, the last txid * @return txid of this transaction or if nothing to do, the last txid
* @throws IOException * @throws IOException
*/ */
@edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION", @edu.umd.cs.findbugs.annotations.SuppressWarnings(value="NP_NULL_ON_SOME_PATH_EXCEPTION",
justification="Will never be null") justification="Will never be null")
private long append(HTableDescriptor htd, final HRegionInfo hri, final HLogKey key, private long append(HTableDescriptor htd, final HRegionInfo hri, final HLogKey key,
WALEdit edits, AtomicLong sequenceId, boolean sync, boolean inMemstore) WALEdit edits, AtomicLong sequenceId, boolean sync, boolean inMemstore,
List<KeyValue> memstoreKVs)
throws IOException { throws IOException {
if (!this.enabled) return this.highestUnsyncedSequence; if (!this.enabled) return this.highestUnsyncedSequence;
if (this.closed) throw new IOException("Cannot append; log is closed"); if (this.closed) throw new IOException("Cannot append; log is closed");
// Make a trace scope for the append. It is closed on other side of the ring buffer by the // Make a trace scope for the append. It is closed on other side of the ring buffer by the
// single consuming thread. Don't have to worry about it. // single consuming thread. Don't have to worry about it.
TraceScope scope = Trace.startSpan("FSHLog.append"); TraceScope scope = Trace.startSpan("FSHLog.append");
// This is crazy how much it takes to make an edit. Do we need all this stuff!!!!???? We need // This is crazy how much it takes to make an edit. Do we need all this stuff!!!!???? We need
// all this to make a key and then below to append the edit, we need to carry htd, info, // all this to make a key and then below to append the edit, we need to carry htd, info,
// etc. all over the ring buffer. // etc. all over the ring buffer.
@ -1124,19 +1141,10 @@ class FSHLog implements HLog, Syncable {
// Construction of FSWALEntry sets a latch. The latch is thrown just after we stamp the // Construction of FSWALEntry sets a latch. The latch is thrown just after we stamp the
// edit with its edit/sequence id. The below entry.getRegionSequenceId will wait on the // edit with its edit/sequence id. The below entry.getRegionSequenceId will wait on the
// latch to be thrown. TODO: reuse FSWALEntry as we do SyncFuture rather create per append. // latch to be thrown. TODO: reuse FSWALEntry as we do SyncFuture rather create per append.
entry = new FSWALEntry(sequence, key, edits, sequenceId, inMemstore, htd, hri); entry = new FSWALEntry(sequence, key, edits, sequenceId, inMemstore, htd, hri, memstoreKVs);
truck.loadPayload(entry, scope.detach()); truck.loadPayload(entry, scope.detach());
} finally { } finally {
this.disruptor.getRingBuffer().publish(sequence); this.disruptor.getRingBuffer().publish(sequence);
// Now wait until the region edit/sequence id is available. The 'entry' has an internal
// latch that is thrown when the region edit/sequence id is set. Calling
// entry.getRegionSequenceId will cause us block until the latch is thrown. The return is
// the region edit/sequence id, not the ring buffer txid.
try {
entry.getRegionSequenceId();
} catch (InterruptedException e) {
throw convertInterruptedExceptionToIOException(e);
}
} }
// doSync is set in tests. Usually we arrive in here via appendNoSync w/ the sync called after // doSync is set in tests. Usually we arrive in here via appendNoSync w/ the sync called after
// all edits on a handler have been added. // all edits on a handler have been added.
@ -1894,6 +1902,14 @@ class FSHLog implements HLog, Syncable {
// here inside this single appending/writing thread. Events are ordered on the ringbuffer // here inside this single appending/writing thread. Events are ordered on the ringbuffer
// so region sequenceids will also be in order. // so region sequenceids will also be in order.
regionSequenceId = entry.stampRegionSequenceId(); regionSequenceId = entry.stampRegionSequenceId();
// Edits are empty, there is nothing to append. Maybe empty when we are looking for a
// region sequence id only, a region edit/sequence id that is not associated with an actual
// edit. It has to go through all the rigmarole to be sure we have the right ordering.
if (entry.getEdit().isEmpty()) {
return;
}
// Coprocessor hook. // Coprocessor hook.
if (!coprocessorHost.preWALWrite(entry.getHRegionInfo(), entry.getKey(), if (!coprocessorHost.preWALWrite(entry.getHRegionInfo(), entry.getKey(),
entry.getEdit())) { entry.getEdit())) {
@ -1909,19 +1925,16 @@ class FSHLog implements HLog, Syncable {
entry.getEdit()); entry.getEdit());
} }
} }
// If empty, there is nothing to append. Maybe empty when we are looking for a region
// sequence id only, a region edit/sequence id that is not associated with an actual edit. writer.append(entry);
// It has to go through all the rigmarole to be sure we have the right ordering. assert highestUnsyncedSequence < entry.getSequence();
if (!entry.getEdit().isEmpty()) { highestUnsyncedSequence = entry.getSequence();
writer.append(entry); Long lRegionSequenceId = Long.valueOf(regionSequenceId);
assert highestUnsyncedSequence < entry.getSequence(); highestRegionSequenceIds.put(encodedRegionName, lRegionSequenceId);
highestUnsyncedSequence = entry.getSequence(); if (entry.isInMemstore()) {
Long lRegionSequenceId = Long.valueOf(regionSequenceId); oldestUnflushedRegionSequenceIds.putIfAbsent(encodedRegionName, lRegionSequenceId);
highestRegionSequenceIds.put(encodedRegionName, lRegionSequenceId);
if (entry.isInMemstore()) {
oldestUnflushedRegionSequenceIds.putIfAbsent(encodedRegionName, lRegionSequenceId);
}
} }
coprocessorHost.postWALWrite(entry.getHRegionInfo(), entry.getKey(), entry.getEdit()); coprocessorHost.postWALWrite(entry.getHRegionInfo(), entry.getKey(), entry.getEdit());
// Update metrics. // Update metrics.
postAppend(entry, EnvironmentEdgeManager.currentTimeMillis() - start); postAppend(entry, EnvironmentEdgeManager.currentTimeMillis() - start);

View File

@ -17,12 +17,14 @@
*/ */
package org.apache.hadoop.hbase.regionserver.wal; package org.apache.hadoop.hbase.regionserver.wal;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.atomic.AtomicLong;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.hadoop.classification.InterfaceAudience; import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
/** /**
* A WAL Entry for {@link FSHLog} implementation. Immutable. * A WAL Entry for {@link FSHLog} implementation. Immutable.
@ -41,19 +43,18 @@ class FSWALEntry extends HLog.Entry {
private final transient boolean inMemstore; private final transient boolean inMemstore;
private final transient HTableDescriptor htd; private final transient HTableDescriptor htd;
private final transient HRegionInfo hri; private final transient HRegionInfo hri;
// Latch that is set on creation and then is undone on the other side of the ring buffer by the private final transient List<KeyValue> memstoreKVs;
// consumer thread just after it sets the region edit/sequence id in here.
private final transient CountDownLatch latch = new CountDownLatch(1);
FSWALEntry(final long sequence, final HLogKey key, final WALEdit edit, FSWALEntry(final long sequence, final HLogKey key, final WALEdit edit,
final AtomicLong referenceToRegionSequenceId, final boolean inMemstore, final AtomicLong referenceToRegionSequenceId, final boolean inMemstore,
final HTableDescriptor htd, final HRegionInfo hri) { final HTableDescriptor htd, final HRegionInfo hri, List<KeyValue> memstoreKVs) {
super(key, edit); super(key, edit);
this.regionSequenceIdReference = referenceToRegionSequenceId; this.regionSequenceIdReference = referenceToRegionSequenceId;
this.inMemstore = inMemstore; this.inMemstore = inMemstore;
this.htd = htd; this.htd = htd;
this.hri = hri; this.hri = hri;
this.sequence = sequence; this.sequence = sequence;
this.memstoreKVs = memstoreKVs;
} }
public String toString() { public String toString() {
@ -90,15 +91,13 @@ class FSWALEntry extends HLog.Entry {
*/ */
long stampRegionSequenceId() { long stampRegionSequenceId() {
long regionSequenceId = this.regionSequenceIdReference.incrementAndGet(); long regionSequenceId = this.regionSequenceIdReference.incrementAndGet();
getKey().setLogSeqNum(regionSequenceId); if(memstoreKVs != null && !memstoreKVs.isEmpty()) {
// On creation, a latch was set. Count it down when sequence id is set. This will free for(KeyValue kv : this.memstoreKVs){
// up anyone blocked on {@link #getRegionSequenceId()} kv.setMvccVersion(regionSequenceId);
this.latch.countDown(); }
}
HLogKey key = getKey();
key.setLogSeqNum(regionSequenceId);
return regionSequenceId; return regionSequenceId;
} }
}
long getRegionSequenceId() throws InterruptedException {
this.latch.await();
return getKey().getLogSeqNum();
}
}

View File

@ -34,8 +34,10 @@ import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HRegionInfo;
import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.HTableDescriptor;
import org.apache.hadoop.hbase.KeyValue;
import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.protobuf.generated.WALProtos.WALTrailer; import org.apache.hadoop.hbase.protobuf.generated.WALProtos.WALTrailer;
import org.apache.hadoop.io.Writable; import org.apache.hadoop.io.Writable;
@ -290,8 +292,8 @@ public interface HLog {
* @param sequenceId * @param sequenceId
* @throws IOException * @throws IOException
* @deprecated For tests only and even then, should use * @deprecated For tests only and even then, should use
* {@link #appendNoSync(HTableDescriptor, HRegionInfo, HLogKey, WALEdit, AtomicLong, boolean)} * {@link #appendNoSync(HTableDescriptor, HRegionInfo, HLogKey, WALEdit, AtomicLong, boolean,
* and {@link #sync()} instead. * List)} and {@link #sync()} instead.
*/ */
@VisibleForTesting @VisibleForTesting
public void append(HRegionInfo info, TableName tableName, WALEdit edits, public void append(HRegionInfo info, TableName tableName, WALEdit edits,
@ -337,7 +339,7 @@ public interface HLog {
* able to sync an explicit edit only (the current default implementation syncs up to the time * able to sync an explicit edit only (the current default implementation syncs up to the time
* of the sync call syncing whatever is behind the sync). * of the sync call syncing whatever is behind the sync).
* @throws IOException * @throws IOException
* @deprecated Use {@link #appendNoSync(HTableDescriptor, HRegionInfo, HLogKey, WALEdit, AtomicLong, boolean)} * @deprecated Use {@link #appendNoSync(HTableDescriptor, HRegionInfo, HLogKey, WALEdit, AtomicLong, boolean, List)}
* instead because you can get back the region edit/sequenceid; it is set into the passed in * instead because you can get back the region edit/sequenceid; it is set into the passed in
* <code>key</code>. * <code>key</code>.
*/ */
@ -361,12 +363,13 @@ public interface HLog {
* @param inMemstore Always true except for case where we are writing a compaction completion * @param inMemstore Always true except for case where we are writing a compaction completion
* record into the WAL; in this case the entry is just so we can finish an unfinished compaction * record into the WAL; in this case the entry is just so we can finish an unfinished compaction
* -- it is not an edit for memstore. * -- it is not an edit for memstore.
* @param memstoreKVs list of KVs added into memstore
* @return Returns a 'transaction id' and <code>key</code> will have the region edit/sequence id * @return Returns a 'transaction id' and <code>key</code> will have the region edit/sequence id
* in it. * in it.
* @throws IOException * @throws IOException
*/ */
long appendNoSync(HTableDescriptor htd, HRegionInfo info, HLogKey key, WALEdit edits, long appendNoSync(HTableDescriptor htd, HRegionInfo info, HLogKey key, WALEdit edits,
AtomicLong sequenceId, boolean inMemstore) AtomicLong sequenceId, boolean inMemstore, List<KeyValue> memstoreKVs)
throws IOException; throws IOException;
// TODO: Do we need all these versions of sync? // TODO: Do we need all these versions of sync?

View File

@ -22,6 +22,7 @@ import java.io.DataInput;
import java.io.DataOutput; import java.io.DataOutput;
import java.io.EOFException; import java.io.EOFException;
import java.io.IOException; import java.io.IOException;
import java.io.InterruptedIOException;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.Collections; import java.util.Collections;
import java.util.HashMap; import java.util.HashMap;
@ -31,6 +32,10 @@ import java.util.Map;
import java.util.NavigableMap; import java.util.NavigableMap;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.UUID; import java.util.UUID;
import java.util.concurrent.CountDownLatch;
import com.google.protobuf.HBaseZeroCopyByteString;
import org.apache.commons.logging.Log; import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory; import org.apache.commons.logging.LogFactory;
@ -42,6 +47,7 @@ import org.apache.hadoop.hbase.protobuf.generated.HBaseProtos;
import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FamilyScope; import org.apache.hadoop.hbase.protobuf.generated.WALProtos.FamilyScope;
import org.apache.hadoop.hbase.protobuf.generated.WALProtos.ScopeType; import org.apache.hadoop.hbase.protobuf.generated.WALProtos.ScopeType;
import org.apache.hadoop.hbase.protobuf.generated.WALProtos.WALKey; import org.apache.hadoop.hbase.protobuf.generated.WALProtos.WALKey;
import org.apache.hadoop.hbase.regionserver.SequenceNumber;
import org.apache.hadoop.hbase.util.Bytes; import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager; import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
import org.apache.hadoop.io.WritableComparable; import org.apache.hadoop.io.WritableComparable;
@ -49,7 +55,6 @@ import org.apache.hadoop.io.WritableUtils;
import com.google.common.annotations.VisibleForTesting; import com.google.common.annotations.VisibleForTesting;
import com.google.protobuf.ByteString; import com.google.protobuf.ByteString;
import com.google.protobuf.HBaseZeroCopyByteString;
/** /**
* A Key for an entry in the change log. * A Key for an entry in the change log.
@ -64,7 +69,7 @@ import com.google.protobuf.HBaseZeroCopyByteString;
// TODO: Key and WALEdit are never used separately, or in one-to-many relation, for practical // TODO: Key and WALEdit are never used separately, or in one-to-many relation, for practical
// purposes. They need to be merged into HLogEntry. // purposes. They need to be merged into HLogEntry.
@InterfaceAudience.Private @InterfaceAudience.Private
public class HLogKey implements WritableComparable<HLogKey> { public class HLogKey implements WritableComparable<HLogKey>, SequenceNumber {
public static final Log LOG = LogFactory.getLog(HLogKey.class); public static final Log LOG = LogFactory.getLog(HLogKey.class);
// should be < 0 (@see #readFields(DataInput)) // should be < 0 (@see #readFields(DataInput))
@ -114,6 +119,7 @@ public class HLogKey implements WritableComparable<HLogKey> {
private byte [] encodedRegionName; private byte [] encodedRegionName;
private TableName tablename; private TableName tablename;
private long logSeqNum; private long logSeqNum;
private CountDownLatch seqNumAssignedLatch = new CountDownLatch(1);
// Time at which this edit was written. // Time at which this edit was written.
private long writeTime; private long writeTime;
@ -184,7 +190,8 @@ public class HLogKey implements WritableComparable<HLogKey> {
*/ */
public HLogKey(final byte [] encodedRegionName, final TableName tablename, public HLogKey(final byte [] encodedRegionName, final TableName tablename,
final long now, List<UUID> clusterIds, long nonceGroup, long nonce) { final long now, List<UUID> clusterIds, long nonceGroup, long nonce) {
init(encodedRegionName, tablename, HLog.NO_SEQUENCE_ID, now, clusterIds, nonceGroup, nonce); init(encodedRegionName, tablename, HLog.NO_SEQUENCE_ID, now, clusterIds,
nonceGroup, nonce);
} }
/** /**
@ -195,13 +202,14 @@ public class HLogKey implements WritableComparable<HLogKey> {
* @param encodedRegionName Encoded name of the region as returned by * @param encodedRegionName Encoded name of the region as returned by
* <code>HRegionInfo#getEncodedNameAsBytes()</code>. * <code>HRegionInfo#getEncodedNameAsBytes()</code>.
* @param tablename * @param tablename
* @param logSeqNum
* @param nonceGroup * @param nonceGroup
* @param nonce * @param nonce
*/ */
public HLogKey(final byte [] encodedRegionName, final TableName tablename, long nonceGroup, public HLogKey(final byte [] encodedRegionName, final TableName tablename, long logSeqNum,
long nonce) { long nonceGroup, long nonce) {
init(encodedRegionName, tablename, HLog.NO_SEQUENCE_ID, init(encodedRegionName, tablename, logSeqNum, EnvironmentEdgeManager.currentTimeMillis(),
EnvironmentEdgeManager.currentTimeMillis(), EMPTY_UUIDS, nonceGroup, nonce); EMPTY_UUIDS, nonceGroup, nonce);
} }
protected void init(final byte [] encodedRegionName, final TableName tablename, protected void init(final byte [] encodedRegionName, final TableName tablename,
@ -238,11 +246,30 @@ public class HLogKey implements WritableComparable<HLogKey> {
} }
/** /**
* Allow that the log sequence id to be set post-construction. * Allow that the log sequence id to be set post-construction and release all waiters on assigned
* sequence number.
* @param sequence * @param sequence
*/ */
void setLogSeqNum(final long sequence) { void setLogSeqNum(final long sequence) {
this.logSeqNum = sequence; this.logSeqNum = sequence;
this.seqNumAssignedLatch.countDown();
}
/**
* Wait for sequence number is assigned & return the assigned value
* @return long the new assigned sequence number
* @throws InterruptedException
*/
public long getSequenceNumber() throws IOException {
try {
this.seqNumAssignedLatch.await();
} catch (InterruptedException ie) {
LOG.warn("Thread interrupted waiting for next log sequence number");
InterruptedIOException iie = new InterruptedIOException();
iie.initCause(ie);
throw iie;
}
return this.logSeqNum;
} }
/** /**
@ -358,7 +385,7 @@ public class HLogKey implements WritableComparable<HLogKey> {
if (result == 0) { if (result == 0) {
if (this.logSeqNum < o.logSeqNum) { if (this.logSeqNum < o.logSeqNum) {
result = -1; result = -1;
} else if (this.logSeqNum > o.logSeqNum ) { } else if (this.logSeqNum > o.logSeqNum) {
result = 1; result = 1;
} }
if (result == 0) { if (result == 0) {

View File

@ -1972,8 +1972,8 @@ public class HLogSplitter {
clusterIds.add(new UUID(uuid.getMostSigBits(), uuid.getLeastSigBits())); clusterIds.add(new UUID(uuid.getMostSigBits(), uuid.getLeastSigBits()));
} }
key = new HLogKey(walKey.getEncodedRegionName().toByteArray(), TableName.valueOf(walKey key = new HLogKey(walKey.getEncodedRegionName().toByteArray(), TableName.valueOf(walKey
.getTableName().toByteArray()), walKey.getLogSequenceNumber(), walKey.getWriteTime(), .getTableName().toByteArray()), walKey.getLogSequenceNumber(),
clusterIds, walKey.getNonceGroup(), walKey.getNonce()); walKey.getWriteTime(), clusterIds, walKey.getNonceGroup(), walKey.getNonce());
logEntry.setFirst(key); logEntry.setFirst(key);
logEntry.setSecond(val); logEntry.setSecond(val);
} }

View File

@ -262,7 +262,7 @@ public class HLogUtil {
final CompactionDescriptor c, AtomicLong sequenceId) throws IOException { final CompactionDescriptor c, AtomicLong sequenceId) throws IOException {
TableName tn = TableName.valueOf(c.getTableName().toByteArray()); TableName tn = TableName.valueOf(c.getTableName().toByteArray());
HLogKey key = new HLogKey(info.getEncodedNameAsBytes(), tn); HLogKey key = new HLogKey(info.getEncodedNameAsBytes(), tn);
log.appendNoSync(htd, info, key, WALEdit.createCompaction(c), sequenceId, false); log.appendNoSync(htd, info, key, WALEdit.createCompaction(c), sequenceId, false, null);
log.sync(); log.sync();
if (LOG.isTraceEnabled()) { if (LOG.isTraceEnabled()) {
LOG.trace("Appended compaction marker " + TextFormat.shortDebugString(c)); LOG.trace("Appended compaction marker " + TextFormat.shortDebugString(c));

View File

@ -249,7 +249,7 @@ public class WALEdit implements Writable, HeapSize {
sb.append(">]"); sb.append(">]");
return sb.toString(); return sb.toString();
} }
/** /**
* Create a compacion WALEdit * Create a compacion WALEdit
* @param c * @param c

View File

@ -68,7 +68,7 @@ public class TestMultiParallel {
private static final byte[] ONE_ROW = Bytes.toBytes("xxx"); private static final byte[] ONE_ROW = Bytes.toBytes("xxx");
private static final byte [][] KEYS = makeKeys(); private static final byte [][] KEYS = makeKeys();
private static final int slaves = 2; // also used for testing HTable pool size private static final int slaves = 3; // also used for testing HTable pool size
@BeforeClass public static void beforeClass() throws Exception { @BeforeClass public static void beforeClass() throws Exception {
((Log4JLogger)RpcServer.LOG).getLogger().setLevel(Level.ALL); ((Log4JLogger)RpcServer.LOG).getLogger().setLevel(Level.ALL);
@ -238,7 +238,7 @@ public class TestMultiParallel {
* *
* @throws Exception * @throws Exception
*/ */
@Test (timeout=300000) @Test (timeout=360000)
public void testFlushCommitsWithAbort() throws Exception { public void testFlushCommitsWithAbort() throws Exception {
LOG.info("test=testFlushCommitsWithAbort"); LOG.info("test=testFlushCommitsWithAbort");
doTestFlushCommits(true); doTestFlushCommits(true);

View File

@ -25,6 +25,7 @@ import java.util.ArrayList;
import java.util.Arrays; import java.util.Arrays;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import java.util.concurrent.atomic.AtomicReference; import java.util.concurrent.atomic.AtomicReference;
import junit.framework.TestCase; import junit.framework.TestCase;
@ -61,6 +62,7 @@ public class TestDefaultMemStore extends TestCase {
private static final int QUALIFIER_COUNT = ROW_COUNT; private static final int QUALIFIER_COUNT = ROW_COUNT;
private static final byte [] FAMILY = Bytes.toBytes("column"); private static final byte [] FAMILY = Bytes.toBytes("column");
private MultiVersionConsistencyControl mvcc; private MultiVersionConsistencyControl mvcc;
private AtomicLong startSeqNum = new AtomicLong(0);
@Override @Override
public void setUp() throws Exception { public void setUp() throws Exception {
@ -236,7 +238,7 @@ public class TestDefaultMemStore extends TestCase {
final byte[] v = Bytes.toBytes("value"); final byte[] v = Bytes.toBytes("value");
MultiVersionConsistencyControl.WriteEntry w = MultiVersionConsistencyControl.WriteEntry w =
mvcc.beginMemstoreInsert(); mvcc.beginMemstoreInsertWithSeqNum(this.startSeqNum.incrementAndGet());
KeyValue kv1 = new KeyValue(row, f, q1, v); KeyValue kv1 = new KeyValue(row, f, q1, v);
kv1.setMvccVersion(w.getWriteNumber()); kv1.setMvccVersion(w.getWriteNumber());
@ -250,7 +252,7 @@ public class TestDefaultMemStore extends TestCase {
s = this.memstore.getScanners(mvcc.memstoreReadPoint()).get(0); s = this.memstore.getScanners(mvcc.memstoreReadPoint()).get(0);
assertScannerResults(s, new KeyValue[]{kv1}); assertScannerResults(s, new KeyValue[]{kv1});
w = mvcc.beginMemstoreInsert(); w = mvcc.beginMemstoreInsertWithSeqNum(this.startSeqNum.incrementAndGet());
KeyValue kv2 = new KeyValue(row, f, q2, v); KeyValue kv2 = new KeyValue(row, f, q2, v);
kv2.setMvccVersion(w.getWriteNumber()); kv2.setMvccVersion(w.getWriteNumber());
memstore.add(kv2); memstore.add(kv2);
@ -280,7 +282,7 @@ public class TestDefaultMemStore extends TestCase {
// INSERT 1: Write both columns val1 // INSERT 1: Write both columns val1
MultiVersionConsistencyControl.WriteEntry w = MultiVersionConsistencyControl.WriteEntry w =
mvcc.beginMemstoreInsert(); mvcc.beginMemstoreInsertWithSeqNum(this.startSeqNum.incrementAndGet());
KeyValue kv11 = new KeyValue(row, f, q1, v1); KeyValue kv11 = new KeyValue(row, f, q1, v1);
kv11.setMvccVersion(w.getWriteNumber()); kv11.setMvccVersion(w.getWriteNumber());
@ -296,7 +298,7 @@ public class TestDefaultMemStore extends TestCase {
assertScannerResults(s, new KeyValue[]{kv11, kv12}); assertScannerResults(s, new KeyValue[]{kv11, kv12});
// START INSERT 2: Write both columns val2 // START INSERT 2: Write both columns val2
w = mvcc.beginMemstoreInsert(); w = mvcc.beginMemstoreInsertWithSeqNum(this.startSeqNum.incrementAndGet());
KeyValue kv21 = new KeyValue(row, f, q1, v2); KeyValue kv21 = new KeyValue(row, f, q1, v2);
kv21.setMvccVersion(w.getWriteNumber()); kv21.setMvccVersion(w.getWriteNumber());
memstore.add(kv21); memstore.add(kv21);
@ -332,7 +334,7 @@ public class TestDefaultMemStore extends TestCase {
final byte[] v1 = Bytes.toBytes("value1"); final byte[] v1 = Bytes.toBytes("value1");
// INSERT 1: Write both columns val1 // INSERT 1: Write both columns val1
MultiVersionConsistencyControl.WriteEntry w = MultiVersionConsistencyControl.WriteEntry w =
mvcc.beginMemstoreInsert(); mvcc.beginMemstoreInsertWithSeqNum(this.startSeqNum.incrementAndGet());
KeyValue kv11 = new KeyValue(row, f, q1, v1); KeyValue kv11 = new KeyValue(row, f, q1, v1);
kv11.setMvccVersion(w.getWriteNumber()); kv11.setMvccVersion(w.getWriteNumber());
@ -348,7 +350,7 @@ public class TestDefaultMemStore extends TestCase {
assertScannerResults(s, new KeyValue[]{kv11, kv12}); assertScannerResults(s, new KeyValue[]{kv11, kv12});
// START DELETE: Insert delete for one of the columns // START DELETE: Insert delete for one of the columns
w = mvcc.beginMemstoreInsert(); w = mvcc.beginMemstoreInsertWithSeqNum(this.startSeqNum.incrementAndGet());
KeyValue kvDel = new KeyValue(row, f, q2, kv11.getTimestamp(), KeyValue kvDel = new KeyValue(row, f, q2, kv11.getTimestamp(),
KeyValue.Type.DeleteColumn); KeyValue.Type.DeleteColumn);
kvDel.setMvccVersion(w.getWriteNumber()); kvDel.setMvccVersion(w.getWriteNumber());
@ -377,6 +379,7 @@ public class TestDefaultMemStore extends TestCase {
final MultiVersionConsistencyControl mvcc; final MultiVersionConsistencyControl mvcc;
final MemStore memstore; final MemStore memstore;
final AtomicLong startSeqNum;
AtomicReference<Throwable> caughtException; AtomicReference<Throwable> caughtException;
@ -384,12 +387,14 @@ public class TestDefaultMemStore extends TestCase {
public ReadOwnWritesTester(int id, public ReadOwnWritesTester(int id,
MemStore memstore, MemStore memstore,
MultiVersionConsistencyControl mvcc, MultiVersionConsistencyControl mvcc,
AtomicReference<Throwable> caughtException) AtomicReference<Throwable> caughtException,
AtomicLong startSeqNum)
{ {
this.mvcc = mvcc; this.mvcc = mvcc;
this.memstore = memstore; this.memstore = memstore;
this.caughtException = caughtException; this.caughtException = caughtException;
row = Bytes.toBytes(id); row = Bytes.toBytes(id);
this.startSeqNum = startSeqNum;
} }
public void run() { public void run() {
@ -403,7 +408,7 @@ public class TestDefaultMemStore extends TestCase {
private void internalRun() throws IOException { private void internalRun() throws IOException {
for (long i = 0; i < NUM_TRIES && caughtException.get() == null; i++) { for (long i = 0; i < NUM_TRIES && caughtException.get() == null; i++) {
MultiVersionConsistencyControl.WriteEntry w = MultiVersionConsistencyControl.WriteEntry w =
mvcc.beginMemstoreInsert(); mvcc.beginMemstoreInsertWithSeqNum(this.startSeqNum.incrementAndGet());
// Insert the sequence value (i) // Insert the sequence value (i)
byte[] v = Bytes.toBytes(i); byte[] v = Bytes.toBytes(i);
@ -433,7 +438,7 @@ public class TestDefaultMemStore extends TestCase {
AtomicReference<Throwable> caught = new AtomicReference<Throwable>(); AtomicReference<Throwable> caught = new AtomicReference<Throwable>();
for (int i = 0; i < NUM_THREADS; i++) { for (int i = 0; i < NUM_THREADS; i++) {
threads[i] = new ReadOwnWritesTester(i, memstore, mvcc, caught); threads[i] = new ReadOwnWritesTester(i, memstore, mvcc, caught, this.startSeqNum);
threads[i].start(); threads[i].start();
} }

View File

@ -4152,15 +4152,16 @@ public class TestHRegion {
durabilityTest(method, Durability.ASYNC_WAL, Durability.USE_DEFAULT, 5000, true, false, true); durabilityTest(method, Durability.ASYNC_WAL, Durability.USE_DEFAULT, 5000, true, false, true);
// expect skip wal cases // expect skip wal cases
durabilityTest(method, Durability.SYNC_WAL, Durability.SKIP_WAL, 0, false, false, false); durabilityTest(method, Durability.SYNC_WAL, Durability.SKIP_WAL, 0, true, false, false);
durabilityTest(method, Durability.FSYNC_WAL, Durability.SKIP_WAL, 0, false, false, false); durabilityTest(method, Durability.FSYNC_WAL, Durability.SKIP_WAL, 0, true, false, false);
durabilityTest(method, Durability.ASYNC_WAL, Durability.SKIP_WAL, 0, false, false, false); durabilityTest(method, Durability.ASYNC_WAL, Durability.SKIP_WAL, 0, true, false, false);
durabilityTest(method, Durability.SKIP_WAL, Durability.SKIP_WAL, 0, false, false, false); durabilityTest(method, Durability.SKIP_WAL, Durability.SKIP_WAL, 0, true, false, false);
durabilityTest(method, Durability.USE_DEFAULT, Durability.SKIP_WAL, 0, false, false, false); durabilityTest(method, Durability.USE_DEFAULT, Durability.SKIP_WAL, 0, true, false, false);
durabilityTest(method, Durability.SKIP_WAL, Durability.USE_DEFAULT, 0, false, false, false); durabilityTest(method, Durability.SKIP_WAL, Durability.USE_DEFAULT, 0, true, false, false);
} }
@SuppressWarnings("unchecked")
private void durabilityTest(String method, Durability tableDurability, private void durabilityTest(String method, Durability tableDurability,
Durability mutationDurability, long timeout, boolean expectAppend, final boolean expectSync, Durability mutationDurability, long timeout, boolean expectAppend, final boolean expectSync,
final boolean expectSyncFromLogSyncer) throws Exception { final boolean expectSyncFromLogSyncer) throws Exception {
@ -4183,7 +4184,7 @@ public class TestHRegion {
//verify append called or not //verify append called or not
verify(log, expectAppend ? times(1) : never()) verify(log, expectAppend ? times(1) : never())
.appendNoSync((HTableDescriptor)any(), (HRegionInfo)any(), (HLogKey)any(), .appendNoSync((HTableDescriptor)any(), (HRegionInfo)any(), (HLogKey)any(),
(WALEdit)any(), (AtomicLong)any(), Mockito.anyBoolean()); (WALEdit)any(), (AtomicLong)any(), Mockito.anyBoolean(), (List<KeyValue>)any());
// verify sync called or not // verify sync called or not
if (expectSync || expectSyncFromLogSyncer) { if (expectSync || expectSyncFromLogSyncer) {
@ -4202,7 +4203,7 @@ public class TestHRegion {
} }
}); });
} else { } else {
verify(log, never()).sync(anyLong()); //verify(log, never()).sync(anyLong());
verify(log, never()).sync(); verify(log, never()).sync();
} }

View File

@ -46,8 +46,10 @@ public class TestMultiVersionConsistencyControl extends TestCase {
public boolean failed = false; public boolean failed = false;
public void run() { public void run() {
AtomicLong startPoint = new AtomicLong();
while (!finished.get()) { while (!finished.get()) {
MultiVersionConsistencyControl.WriteEntry e = mvcc.beginMemstoreInsert(); MultiVersionConsistencyControl.WriteEntry e =
mvcc.beginMemstoreInsertWithSeqNum(startPoint.incrementAndGet());
// System.out.println("Begin write: " + e.getWriteNumber()); // System.out.println("Begin write: " + e.getWriteNumber());
// 10 usec - 500usec (including 0) // 10 usec - 500usec (including 0)
int sleepTime = rnd.nextInt(500); int sleepTime = rnd.nextInt(500);

View File

@ -208,7 +208,7 @@ public class TestStore {
long size = store.memstore.getFlushableSize(); long size = store.memstore.getFlushableSize();
Assert.assertEquals(0, size); Assert.assertEquals(0, size);
LOG.info("Adding some data"); LOG.info("Adding some data");
long kvSize = store.add(new KeyValue(row, family, qf1, 1, (byte[])null)); long kvSize = store.add(new KeyValue(row, family, qf1, 1, (byte[])null)).getFirst();
size = store.memstore.getFlushableSize(); size = store.memstore.getFlushableSize();
Assert.assertEquals(kvSize, size); Assert.assertEquals(kvSize, size);
// Flush. Bug #1 from HBASE-10466. Make sure size calculation on failed flush is right. // Flush. Bug #1 from HBASE-10466. Make sure size calculation on failed flush is right.
@ -604,19 +604,19 @@ public class TestStore {
size += this.store.add(new KeyValue(Bytes.toBytes("200909091000"), family, qf1, size += this.store.add(new KeyValue(Bytes.toBytes("200909091000"), family, qf1,
System.currentTimeMillis(), System.currentTimeMillis(),
Bytes.toBytes(newValue))); Bytes.toBytes(newValue))).getFirst();
size += this.store.add(new KeyValue(Bytes.toBytes("200909091200"), family, qf1, size += this.store.add(new KeyValue(Bytes.toBytes("200909091200"), family, qf1,
System.currentTimeMillis(), System.currentTimeMillis(),
Bytes.toBytes(newValue))); Bytes.toBytes(newValue))).getFirst();
size += this.store.add(new KeyValue(Bytes.toBytes("200909091300"), family, qf1, size += this.store.add(new KeyValue(Bytes.toBytes("200909091300"), family, qf1,
System.currentTimeMillis(), System.currentTimeMillis(),
Bytes.toBytes(newValue))); Bytes.toBytes(newValue))).getFirst();
size += this.store.add(new KeyValue(Bytes.toBytes("200909091400"), family, qf1, size += this.store.add(new KeyValue(Bytes.toBytes("200909091400"), family, qf1,
System.currentTimeMillis(), System.currentTimeMillis(),
Bytes.toBytes(newValue))); Bytes.toBytes(newValue))).getFirst();
size += this.store.add(new KeyValue(Bytes.toBytes("200909091500"), family, qf1, size += this.store.add(new KeyValue(Bytes.toBytes("200909091500"), family, qf1,
System.currentTimeMillis(), System.currentTimeMillis(),
Bytes.toBytes(newValue))); Bytes.toBytes(newValue))).getFirst();
for ( int i = 0 ; i < 10000 ; ++i) { for ( int i = 0 ; i < 10000 ; ++i) {