HBASE-4528 The put operation can release the rowlock before sync-ing the Hlog (dhruba via jgray)
git-svn-id: https://svn.apache.org/repos/asf/hbase/trunk@1190606 13f79535-47bb-0310-9956-ffa450edef68
This commit is contained in:
parent
9d172f9b96
commit
804e99e4d0
|
@ -4,6 +4,8 @@ Release 0.93.0 - Unreleased
|
|||
HBASE-4460 Support running an embedded ThriftServer within a RegionServer (jgray)
|
||||
HBASE-4536 Allow CF to retain deleted rows (Lars H)
|
||||
HBASE-4629 Enable automated patch testing for hbase (Giridharan Kesavan)
|
||||
HBASE-4528 The put operation can release the rowlock before sync-ing the
|
||||
Hlog (dhruba via jgray)
|
||||
|
||||
IMPROVEMENT
|
||||
HBASE-4132 Extend the WALActionsListener API to accomodate log archival
|
||||
|
|
|
@ -1209,6 +1209,7 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
// during the flush
|
||||
long sequenceId = -1L;
|
||||
long completeSequenceId = -1L;
|
||||
ReadWriteConsistencyControl.WriteEntry w = null;
|
||||
|
||||
// We have to take a write lock during snapshot, or else a write could
|
||||
// end up in both snapshot and memstore (makes it difficult to do atomic
|
||||
|
@ -1219,6 +1220,10 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
final long currentMemStoreSize = this.memstoreSize.get();
|
||||
List<StoreFlusher> storeFlushers = new ArrayList<StoreFlusher>(stores.size());
|
||||
try {
|
||||
// Record the rwcc for all transactions in progress.
|
||||
w = rwcc.beginMemstoreInsert();
|
||||
rwcc.advanceMemstore(w);
|
||||
|
||||
sequenceId = (wal == null)? myseqid :
|
||||
wal.startCacheFlush(this.regionInfo.getEncodedNameAsBytes());
|
||||
completeSequenceId = this.getCompleteCacheFlushSequenceId(sequenceId);
|
||||
|
@ -1234,8 +1239,17 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
} finally {
|
||||
this.updatesLock.writeLock().unlock();
|
||||
}
|
||||
status.setStatus("Flushing stores");
|
||||
status.setStatus("Waiting for rwcc");
|
||||
LOG.debug("Finished snapshotting, commencing waiting for rwcc");
|
||||
|
||||
// wait for all in-progress transactions to commit to HLog before
|
||||
// we can start the flush. This prevents
|
||||
// uncommitted transactions from being written into HFiles.
|
||||
// We have to block before we start the flush, otherwise keys that
|
||||
// were removed via a rollbackMemstore could be written to Hfiles.
|
||||
rwcc.waitForRead(w);
|
||||
|
||||
status.setStatus("Flushing stores");
|
||||
LOG.debug("Finished snapshotting, commencing flushing stores");
|
||||
|
||||
// Any failure from here on out will be catastrophic requiring server
|
||||
|
@ -1246,15 +1260,17 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
try {
|
||||
// A. Flush memstore to all the HStores.
|
||||
// Keep running vector of all store files that includes both old and the
|
||||
// just-made new flush store file.
|
||||
// just-made new flush store file. The new flushed file is still in the
|
||||
// tmp directory.
|
||||
|
||||
for (StoreFlusher flusher : storeFlushers) {
|
||||
flusher.flushCache(status);
|
||||
}
|
||||
|
||||
// Switch snapshot (in memstore) -> new hfile (thus causing
|
||||
// all the store scanners to reset/reseek).
|
||||
for (StoreFlusher flusher : storeFlushers) {
|
||||
boolean needsCompaction = flusher.commit();
|
||||
boolean needsCompaction = flusher.commit(status);
|
||||
if (needsCompaction) {
|
||||
compactionRequested = true;
|
||||
}
|
||||
|
@ -1483,11 +1499,12 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
}
|
||||
|
||||
/**
|
||||
* This is used only by unit tests. Not required to be a public API.
|
||||
* @param familyMap map of family to edits for the given family.
|
||||
* @param writeToWAL
|
||||
* @throws IOException
|
||||
*/
|
||||
public void delete(Map<byte[], List<KeyValue>> familyMap, UUID clusterId,
|
||||
void delete(Map<byte[], List<KeyValue>> familyMap, UUID clusterId,
|
||||
boolean writeToWAL) throws IOException {
|
||||
Delete delete = new Delete();
|
||||
delete.setFamilyMap(familyMap);
|
||||
|
@ -1577,7 +1594,7 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
}
|
||||
|
||||
// Now make changes to the memstore.
|
||||
long addedSize = applyFamilyMapToMemstore(familyMap);
|
||||
long addedSize = applyFamilyMapToMemstore(familyMap, null);
|
||||
flush = isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize));
|
||||
|
||||
if (coprocessorHost != null) {
|
||||
|
@ -1745,8 +1762,9 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
}
|
||||
}
|
||||
|
||||
long now = EnvironmentEdgeManager.currentTimeMillis();
|
||||
byte[] byteNow = Bytes.toBytes(now);
|
||||
ReadWriteConsistencyControl.WriteEntry w = null;
|
||||
long txid = 0;
|
||||
boolean walSyncSuccessful = false;
|
||||
boolean locked = false;
|
||||
|
||||
/** Keep track of the locks we hold so we can release them in finally clause */
|
||||
|
@ -1805,6 +1823,12 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
lastIndexExclusive++;
|
||||
numReadyToWrite++;
|
||||
}
|
||||
|
||||
// we should record the timestamp only after we have acquired the rowLock,
|
||||
// otherwise, newer puts are not guaranteed to have a newer timestamp
|
||||
long now = EnvironmentEdgeManager.currentTimeMillis();
|
||||
byte[] byteNow = Bytes.toBytes(now);
|
||||
|
||||
// Nothing to put -- an exception in the above such as NoSuchColumnFamily?
|
||||
if (numReadyToWrite <= 0) return 0L;
|
||||
|
||||
|
@ -1823,32 +1847,23 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
byteNow);
|
||||
}
|
||||
|
||||
|
||||
this.updatesLock.readLock().lock();
|
||||
locked = true;
|
||||
|
||||
//
|
||||
// ------------------------------------
|
||||
// STEP 3. Write to WAL
|
||||
// Acquire the latest rwcc number
|
||||
// ----------------------------------
|
||||
for (int i = firstIndex; i < lastIndexExclusive; i++) {
|
||||
// Skip puts that were determined to be invalid during preprocessing
|
||||
if (batchOp.retCodeDetails[i].getOperationStatusCode()
|
||||
!= OperationStatusCode.NOT_RUN) {
|
||||
continue;
|
||||
}
|
||||
|
||||
Put p = batchOp.operations[i].getFirst();
|
||||
if (!p.getWriteToWAL()) continue;
|
||||
addFamilyMapToWALEdit(familyMaps[i], walEdit);
|
||||
}
|
||||
|
||||
// Append the edit to WAL
|
||||
Put first = batchOp.operations[firstIndex].getFirst();
|
||||
this.log.append(regionInfo, this.htableDescriptor.getName(),
|
||||
walEdit, first.getClusterId(), now, this.htableDescriptor);
|
||||
w = rwcc.beginMemstoreInsert();
|
||||
|
||||
// ------------------------------------
|
||||
// STEP 4. Write back to memstore
|
||||
// STEP 3. Write back to memstore
|
||||
// Write to memstore. It is ok to write to memstore
|
||||
// first without updating the HLog because we do not roll
|
||||
// forward the memstore RWCC. The RWCC will be moved up when
|
||||
// the complete operation is done. These changes are not yet
|
||||
// visible to scanners till we update the RWCC. The RWCC is
|
||||
// moved only when the sync is complete.
|
||||
// ----------------------------------
|
||||
long addedSize = 0;
|
||||
for (int i = firstIndex; i < lastIndexExclusive; i++) {
|
||||
|
@ -1856,13 +1871,65 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
!= OperationStatusCode.NOT_RUN) {
|
||||
continue;
|
||||
}
|
||||
addedSize += applyFamilyMapToMemstore(familyMaps[i]);
|
||||
batchOp.retCodeDetails[i] = new OperationStatus(
|
||||
OperationStatusCode.SUCCESS);
|
||||
addedSize += applyFamilyMapToMemstore(familyMaps[i], w);
|
||||
}
|
||||
|
||||
// ------------------------------------
|
||||
// STEP 5. Run coprocessor post hooks
|
||||
// STEP 4. Build WAL edit
|
||||
// ----------------------------------
|
||||
for (int i = firstIndex; i < lastIndexExclusive; i++) {
|
||||
// Skip puts that were determined to be invalid during preprocessing
|
||||
if (batchOp.retCodeDetails[i].getOperationStatusCode()
|
||||
!= OperationStatusCode.NOT_RUN) {
|
||||
continue;
|
||||
}
|
||||
batchOp.retCodeDetails[i] = new OperationStatus(OperationStatusCode.SUCCESS);
|
||||
|
||||
Put p = batchOp.operations[i].getFirst();
|
||||
if (!p.getWriteToWAL()) continue;
|
||||
addFamilyMapToWALEdit(familyMaps[i], walEdit);
|
||||
}
|
||||
|
||||
// -------------------------
|
||||
// STEP 5. Append the edit to WAL. Do not sync wal.
|
||||
// -------------------------
|
||||
Put first = batchOp.operations[firstIndex].getFirst();
|
||||
txid = this.log.appendNoSync(regionInfo, this.htableDescriptor.getName(),
|
||||
walEdit, first.getClusterId(), now, this.htableDescriptor);
|
||||
|
||||
// -------------------------------
|
||||
// STEP 6. Release row locks, etc.
|
||||
// -------------------------------
|
||||
if (locked) {
|
||||
this.updatesLock.readLock().unlock();
|
||||
locked = false;
|
||||
}
|
||||
if (acquiredLocks != null) {
|
||||
for (Integer toRelease : acquiredLocks) {
|
||||
releaseRowLock(toRelease);
|
||||
}
|
||||
acquiredLocks = null;
|
||||
}
|
||||
// -------------------------
|
||||
// STEP 7. Sync wal.
|
||||
// -------------------------
|
||||
if (walEdit.size() > 0 &&
|
||||
(this.regionInfo.isMetaRegion() ||
|
||||
!this.htableDescriptor.isDeferredLogFlush())) {
|
||||
this.log.sync(txid);
|
||||
}
|
||||
walSyncSuccessful = true;
|
||||
// ------------------------------------------------------------------
|
||||
// STEP 8. Advance rwcc. This will make this put visible to scanners and getters.
|
||||
// ------------------------------------------------------------------
|
||||
if (w != null) {
|
||||
rwcc.completeMemstoreInsert(w);
|
||||
w = null;
|
||||
}
|
||||
|
||||
// ------------------------------------
|
||||
// STEP 9. Run coprocessor post hooks. This should be done after the wal is
|
||||
// synced so that the coprocessor contract is adhered to.
|
||||
// ------------------------------------
|
||||
if (coprocessorHost != null) {
|
||||
for (int i = firstIndex; i < lastIndexExclusive; i++) {
|
||||
|
@ -1879,11 +1946,21 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
success = true;
|
||||
return addedSize;
|
||||
} finally {
|
||||
if (locked)
|
||||
this.updatesLock.readLock().unlock();
|
||||
|
||||
for (Integer toRelease : acquiredLocks) {
|
||||
releaseRowLock(toRelease);
|
||||
// if the wal sync was unsuccessful, remove keys from memstore
|
||||
if (!walSyncSuccessful) {
|
||||
rollbackMemstore(batchOp, familyMaps, firstIndex, lastIndexExclusive);
|
||||
}
|
||||
if (w != null) rwcc.completeMemstoreInsert(w);
|
||||
|
||||
if (locked) {
|
||||
this.updatesLock.readLock().unlock();
|
||||
}
|
||||
|
||||
if (acquiredLocks != null) {
|
||||
for (Integer toRelease : acquiredLocks) {
|
||||
releaseRowLock(toRelease);
|
||||
}
|
||||
}
|
||||
if (!success) {
|
||||
for (int i = firstIndex; i < lastIndexExclusive; i++) {
|
||||
|
@ -2121,7 +2198,7 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
walEdit, clusterId, now, this.htableDescriptor);
|
||||
}
|
||||
|
||||
long addedSize = applyFamilyMapToMemstore(familyMap);
|
||||
long addedSize = applyFamilyMapToMemstore(familyMap, null);
|
||||
flush = isFlushSize(this.addAndGetGlobalMemstoreSize(addedSize));
|
||||
} finally {
|
||||
this.updatesLock.readLock().unlock();
|
||||
|
@ -2143,14 +2220,22 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
* should already have locked updatesLock.readLock(). This also does
|
||||
* <b>not</b> check the families for validity.
|
||||
*
|
||||
* @param familyMap Map of kvs per family
|
||||
* @param localizedWriteEntry The WriteEntry of the RWCC for this transaction.
|
||||
* If null, then this method internally creates a rwcc transaction.
|
||||
* @return the additional memory usage of the memstore caused by the
|
||||
* new entries.
|
||||
*/
|
||||
private long applyFamilyMapToMemstore(Map<byte[], List<KeyValue>> familyMap) {
|
||||
ReadWriteConsistencyControl.WriteEntry w = null;
|
||||
private long applyFamilyMapToMemstore(Map<byte[], List<KeyValue>> familyMap,
|
||||
ReadWriteConsistencyControl.WriteEntry localizedWriteEntry) {
|
||||
long size = 0;
|
||||
boolean freerwcc = false;
|
||||
|
||||
try {
|
||||
w = rwcc.beginMemstoreInsert();
|
||||
if (localizedWriteEntry == null) {
|
||||
localizedWriteEntry = rwcc.beginMemstoreInsert();
|
||||
freerwcc = true;
|
||||
}
|
||||
|
||||
for (Map.Entry<byte[], List<KeyValue>> e : familyMap.entrySet()) {
|
||||
byte[] family = e.getKey();
|
||||
|
@ -2158,16 +2243,54 @@ public class HRegion implements HeapSize { // , Writable{
|
|||
|
||||
Store store = getStore(family);
|
||||
for (KeyValue kv: edits) {
|
||||
kv.setMemstoreTS(w.getWriteNumber());
|
||||
kv.setMemstoreTS(localizedWriteEntry.getWriteNumber());
|
||||
size += store.add(kv);
|
||||
}
|
||||
}
|
||||
} finally {
|
||||
rwcc.completeMemstoreInsert(w);
|
||||
if (freerwcc) {
|
||||
rwcc.completeMemstoreInsert(localizedWriteEntry);
|
||||
}
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove all the keys listed in the map from the memstore. This method is
|
||||
* called when a Put has updated memstore but subsequently fails to update
|
||||
* the wal. This method is then invoked to rollback the memstore.
|
||||
*/
|
||||
private void rollbackMemstore(BatchOperationInProgress<Pair<Put, Integer>> batchOp,
|
||||
Map<byte[], List<KeyValue>>[] familyMaps,
|
||||
int start, int end) {
|
||||
int kvsRolledback = 0;
|
||||
for (int i = start; i < end; i++) {
|
||||
// skip over request that never succeeded in the first place.
|
||||
if (batchOp.retCodeDetails[i].getOperationStatusCode()
|
||||
!= OperationStatusCode.SUCCESS) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Rollback all the kvs for this row.
|
||||
Map<byte[], List<KeyValue>> familyMap = familyMaps[i];
|
||||
for (Map.Entry<byte[], List<KeyValue>> e : familyMap.entrySet()) {
|
||||
byte[] family = e.getKey();
|
||||
List<KeyValue> edits = e.getValue();
|
||||
|
||||
// Remove those keys from the memstore that matches our
|
||||
// key's (row, cf, cq, timestamp, memstoreTS). The interesting part is
|
||||
// that even the memstoreTS has to match for keys that will be rolled back.
|
||||
Store store = getStore(family);
|
||||
for (KeyValue kv: edits) {
|
||||
store.rollback(kv);
|
||||
kvsRolledback++;
|
||||
}
|
||||
}
|
||||
}
|
||||
LOG.debug("rollbackMemstore rolled back " + kvsRolledback +
|
||||
" keyvalues from start:" + start + " to end:" + end);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check the collection of families for validity.
|
||||
* @throws NoSuchColumnFamilyException if a family does not exist.
|
||||
|
|
|
@ -165,6 +165,10 @@ class KeyValueSkipListSet implements NavigableSet<KeyValue> {
|
|||
throw new UnsupportedOperationException("Not implemented");
|
||||
}
|
||||
|
||||
public KeyValue get(KeyValue kv) {
|
||||
return this.delegatee.get(kv);
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return this.delegatee.size();
|
||||
}
|
||||
|
@ -176,4 +180,4 @@ class KeyValueSkipListSet implements NavigableSet<KeyValue> {
|
|||
public <T> T[] toArray(T[] a) {
|
||||
throw new UnsupportedOperationException("Not implemented");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -249,6 +249,38 @@ public class MemStore implements HeapSize {
|
|||
return newKv;
|
||||
}
|
||||
|
||||
/**
|
||||
* Remove a key from the memstore. Only kvs that have the same key and the
|
||||
* same memstoreTS are removed. It is ok to not update timeRangeTracker
|
||||
* in this call. It is possible that we can optimize this method by using
|
||||
* tailMap/iterator, but since this method is called rarely (only for
|
||||
* error recovery), we can leave those optimization for the future.
|
||||
* @param kv
|
||||
*/
|
||||
void rollback(final KeyValue kv) {
|
||||
this.lock.readLock().lock();
|
||||
try {
|
||||
// If the key is in the snapshot, delete it. We should not update
|
||||
// this.size, because that tracks the size of only the memstore and
|
||||
// not the snapshot. The flush of this snapshot to disk has not
|
||||
// yet started because Store.flush() waits for all rwcc transactions to
|
||||
// commit before starting the flush to disk.
|
||||
KeyValue found = this.snapshot.get(kv);
|
||||
if (found != null && found.getMemstoreTS() == kv.getMemstoreTS()) {
|
||||
this.snapshot.remove(kv);
|
||||
}
|
||||
// If the key is in the memstore, delete it. Update this.size.
|
||||
found = this.kvset.get(kv);
|
||||
if (found != null && found.getMemstoreTS() == kv.getMemstoreTS()) {
|
||||
this.kvset.remove(kv);
|
||||
long s = heapSizeChange(kv, true);
|
||||
this.size.addAndGet(-s);
|
||||
}
|
||||
} finally {
|
||||
this.lock.readLock().unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Write a delete
|
||||
* @param delete
|
||||
|
|
|
@ -87,6 +87,11 @@ public class ReadWriteConsistencyControl {
|
|||
}
|
||||
|
||||
public void completeMemstoreInsert(WriteEntry e) {
|
||||
advanceMemstore(e);
|
||||
waitForRead(e);
|
||||
}
|
||||
|
||||
boolean advanceMemstore(WriteEntry e) {
|
||||
synchronized (writeQueue) {
|
||||
e.markCompleted();
|
||||
|
||||
|
@ -120,10 +125,19 @@ public class ReadWriteConsistencyControl {
|
|||
memstoreRead = nextReadValue;
|
||||
readWaiters.notifyAll();
|
||||
}
|
||||
|
||||
}
|
||||
if (memstoreRead >= e.getWriteNumber()) {
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Wait for the global readPoint to advance up to
|
||||
* the specified transaction number.
|
||||
*/
|
||||
public void waitForRead(WriteEntry e) {
|
||||
boolean interrupted = false;
|
||||
synchronized (readWaiters) {
|
||||
while (memstoreRead < e.getWriteNumber()) {
|
||||
|
|
|
@ -27,6 +27,7 @@ import java.util.Collections;
|
|||
import java.util.List;
|
||||
import java.util.NavigableSet;
|
||||
import java.util.SortedSet;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.concurrent.CopyOnWriteArraySet;
|
||||
import java.util.concurrent.locks.ReentrantReadWriteLock;
|
||||
|
||||
|
@ -321,6 +322,22 @@ public class Store implements HeapSize {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Removes a kv from the memstore. The KeyValue is removed only
|
||||
* if its key & memstoreTS matches the key & memstoreTS value of the
|
||||
* kv parameter.
|
||||
*
|
||||
* @param kv
|
||||
*/
|
||||
protected void rollback(final KeyValue kv) {
|
||||
lock.readLock().lock();
|
||||
try {
|
||||
this.memstore.rollback(kv);
|
||||
} finally {
|
||||
lock.readLock().unlock();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* @return All store files.
|
||||
*/
|
||||
|
@ -447,34 +464,41 @@ public class Store implements HeapSize {
|
|||
* @param logCacheFlushId flush sequence number
|
||||
* @param snapshot
|
||||
* @param snapshotTimeRangeTracker
|
||||
* @return true if a compaction is needed
|
||||
* @param flushedSize The number of bytes flushed
|
||||
* @param status
|
||||
* @return Path The path name of the tmp file to which the store was flushed
|
||||
* @throws IOException
|
||||
*/
|
||||
private StoreFile flushCache(final long logCacheFlushId,
|
||||
private Path flushCache(final long logCacheFlushId,
|
||||
SortedSet<KeyValue> snapshot,
|
||||
TimeRangeTracker snapshotTimeRangeTracker,
|
||||
AtomicLong flushedSize,
|
||||
MonitoredTask status) throws IOException {
|
||||
// If an exception happens flushing, we let it out without clearing
|
||||
// the memstore snapshot. The old snapshot will be returned when we say
|
||||
// 'snapshot', the next time flush comes around.
|
||||
return internalFlushCache(
|
||||
snapshot, logCacheFlushId, snapshotTimeRangeTracker, status);
|
||||
snapshot, logCacheFlushId, snapshotTimeRangeTracker, flushedSize, status);
|
||||
}
|
||||
|
||||
/*
|
||||
* @param cache
|
||||
* @param logCacheFlushId
|
||||
* @return StoreFile created.
|
||||
* @param snapshotTimeRangeTracker
|
||||
* @param flushedSize The number of bytes flushed
|
||||
* @return Path The path name of the tmp file to which the store was flushed
|
||||
* @throws IOException
|
||||
*/
|
||||
private StoreFile internalFlushCache(final SortedSet<KeyValue> set,
|
||||
private Path internalFlushCache(final SortedSet<KeyValue> set,
|
||||
final long logCacheFlushId,
|
||||
TimeRangeTracker snapshotTimeRangeTracker,
|
||||
AtomicLong flushedSize,
|
||||
MonitoredTask status)
|
||||
throws IOException {
|
||||
StoreFile.Writer writer;
|
||||
String fileName;
|
||||
long flushed = 0;
|
||||
Path pathName;
|
||||
// Don't flush if there are no entries.
|
||||
if (set.size() == 0) {
|
||||
return null;
|
||||
|
@ -496,7 +520,7 @@ public class Store implements HeapSize {
|
|||
// A. Write the map out to the disk
|
||||
writer = createWriterInTmp(set.size());
|
||||
writer.setTimeRangeTracker(snapshotTimeRangeTracker);
|
||||
fileName = writer.getPath().getName();
|
||||
pathName = writer.getPath();
|
||||
try {
|
||||
List<KeyValue> kvs = new ArrayList<KeyValue>();
|
||||
boolean hasMore;
|
||||
|
@ -520,17 +544,39 @@ public class Store implements HeapSize {
|
|||
}
|
||||
}
|
||||
} finally {
|
||||
flushedSize.set(flushed);
|
||||
scanner.close();
|
||||
}
|
||||
if (LOG.isInfoEnabled()) {
|
||||
LOG.info("Flushed " +
|
||||
", sequenceid=" + logCacheFlushId +
|
||||
", memsize=" + StringUtils.humanReadableInt(flushed) +
|
||||
", into tmp file " + pathName);
|
||||
}
|
||||
return pathName;
|
||||
}
|
||||
|
||||
/*
|
||||
* @param path The pathname of the tmp file into which the store was flushed
|
||||
* @param logCacheFlushId
|
||||
* @return StoreFile created.
|
||||
* @throws IOException
|
||||
*/
|
||||
private StoreFile commitFile(final Path path,
|
||||
final long logCacheFlushId,
|
||||
TimeRangeTracker snapshotTimeRangeTracker,
|
||||
AtomicLong flushedSize,
|
||||
MonitoredTask status)
|
||||
throws IOException {
|
||||
// Write-out finished successfully, move into the right spot
|
||||
String fileName = path.getName();
|
||||
Path dstPath = new Path(homedir, fileName);
|
||||
validateStoreFile(writer.getPath());
|
||||
String msg = "Renaming flushed file at " + writer.getPath() + " to " + dstPath;
|
||||
validateStoreFile(path);
|
||||
String msg = "Renaming flushed file at " + path + " to " + dstPath;
|
||||
LOG.info(msg);
|
||||
status.setStatus("Flushing " + this + ": " + msg);
|
||||
if (!fs.rename(writer.getPath(), dstPath)) {
|
||||
LOG.warn("Unable to rename " + writer.getPath() + " to " + dstPath);
|
||||
if (!fs.rename(path, dstPath)) {
|
||||
LOG.warn("Unable to rename " + path + " to " + dstPath);
|
||||
}
|
||||
|
||||
status.setStatus("Flushing " + this + ": reopening flushed file");
|
||||
|
@ -546,11 +592,10 @@ public class Store implements HeapSize {
|
|||
// HRegion.internalFlushcache, which indirectly calls this to actually do
|
||||
// the flushing through the StoreFlusherImpl class
|
||||
HRegion.incrNumericPersistentMetric("cf." + this.toString() + ".flushSize",
|
||||
flushed);
|
||||
flushedSize.longValue());
|
||||
if(LOG.isInfoEnabled()) {
|
||||
LOG.info("Added " + sf + ", entries=" + r.getEntries() +
|
||||
", sequenceid=" + logCacheFlushId +
|
||||
", memsize=" + StringUtils.humanReadableInt(flushed) +
|
||||
", filesize=" + StringUtils.humanReadableInt(r.length()));
|
||||
}
|
||||
return sf;
|
||||
|
@ -1815,10 +1860,13 @@ public class Store implements HeapSize {
|
|||
private long cacheFlushId;
|
||||
private SortedSet<KeyValue> snapshot;
|
||||
private StoreFile storeFile;
|
||||
private Path storeFilePath;
|
||||
private TimeRangeTracker snapshotTimeRangeTracker;
|
||||
private AtomicLong flushedSize;
|
||||
|
||||
private StoreFlusherImpl(long cacheFlushId) {
|
||||
this.cacheFlushId = cacheFlushId;
|
||||
this.flushedSize = new AtomicLong();
|
||||
}
|
||||
|
||||
@Override
|
||||
|
@ -1830,15 +1878,17 @@ public class Store implements HeapSize {
|
|||
|
||||
@Override
|
||||
public void flushCache(MonitoredTask status) throws IOException {
|
||||
storeFile = Store.this.flushCache(
|
||||
cacheFlushId, snapshot, snapshotTimeRangeTracker, status);
|
||||
storeFilePath = Store.this.flushCache(
|
||||
cacheFlushId, snapshot, snapshotTimeRangeTracker, flushedSize, status);
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean commit() throws IOException {
|
||||
if (storeFile == null) {
|
||||
public boolean commit(MonitoredTask status) throws IOException {
|
||||
if (storeFilePath == null) {
|
||||
return false;
|
||||
}
|
||||
storeFile = Store.this.commitFile(storeFilePath, cacheFlushId,
|
||||
snapshotTimeRangeTracker, flushedSize, status);
|
||||
// Add new file to store files. Clear snapshot too while we have
|
||||
// the Store write lock.
|
||||
return Store.this.updateStorefiles(storeFile, snapshot);
|
||||
|
|
|
@ -60,5 +60,5 @@ interface StoreFlusher {
|
|||
* @return
|
||||
* @throws IOException
|
||||
*/
|
||||
boolean commit() throws IOException;
|
||||
boolean commit(MonitoredTask status) throws IOException;
|
||||
}
|
||||
|
|
|
@ -1230,7 +1230,7 @@ public class HLog implements Syncable {
|
|||
logSyncerThread.hlogFlush(this.writer);
|
||||
this.writer.sync();
|
||||
syncBatchSize.addAndGet(doneUpto - this.syncedTillHere);
|
||||
this.syncedTillHere = doneUpto;
|
||||
this.syncedTillHere = Math.max(this.syncedTillHere, doneUpto);
|
||||
} catch(IOException io) {
|
||||
syncSuccessful = false;
|
||||
}
|
||||
|
|
|
@ -0,0 +1,257 @@
|
|||
/**
|
||||
* Licensed to the Apache Software Foundation (ASF) under one
|
||||
* or more contributor license agreements. See the NOTICE file
|
||||
* distributed with this work for additional information
|
||||
* regarding copyright ownership. The ASF licenses this file
|
||||
* to you under the Apache License, Version 2.0 (the
|
||||
* "License"); you may not use this file except in compliance
|
||||
* with the License. You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
package org.apache.hadoop.hbase.regionserver;
|
||||
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Arrays;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
import java.util.Random;
|
||||
|
||||
import org.apache.commons.logging.Log;
|
||||
import org.apache.commons.logging.LogFactory;
|
||||
import org.apache.hadoop.conf.Configuration;
|
||||
import org.apache.hadoop.hdfs.MiniDFSCluster;
|
||||
import org.apache.hadoop.fs.FileSystem;
|
||||
import org.apache.hadoop.fs.Path;
|
||||
import org.apache.hadoop.hbase.DoNotRetryIOException;
|
||||
import org.apache.hadoop.hbase.HBaseConfiguration;
|
||||
import org.apache.hadoop.hbase.HBaseTestCase;
|
||||
import org.apache.hadoop.hbase.HBaseTestingUtility;
|
||||
import org.apache.hadoop.hbase.HColumnDescriptor;
|
||||
import org.apache.hadoop.hbase.HConstants;
|
||||
import org.apache.hadoop.hbase.HDFSBlocksDistribution;
|
||||
import org.apache.hadoop.hbase.ServerName;
|
||||
import org.apache.hadoop.hbase.HConstants.OperationStatusCode;
|
||||
import org.apache.hadoop.hbase.HRegionInfo;
|
||||
import org.apache.hadoop.hbase.HTableDescriptor;
|
||||
import org.apache.hadoop.hbase.KeyValue;
|
||||
import org.apache.hadoop.hbase.MultithreadedTestUtil;
|
||||
import org.apache.hadoop.hbase.MultithreadedTestUtil.TestThread;
|
||||
import org.apache.hadoop.hbase.MiniHBaseCluster;
|
||||
import org.apache.hadoop.hbase.client.Delete;
|
||||
import org.apache.hadoop.hbase.client.Get;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.Result;
|
||||
import org.apache.hadoop.hbase.client.Scan;
|
||||
import org.apache.hadoop.hbase.client.HTable;
|
||||
import org.apache.hadoop.hbase.filter.BinaryComparator;
|
||||
import org.apache.hadoop.hbase.filter.ColumnCountGetFilter;
|
||||
import org.apache.hadoop.hbase.filter.CompareFilter.CompareOp;
|
||||
import org.apache.hadoop.hbase.filter.Filter;
|
||||
import org.apache.hadoop.hbase.filter.FilterList;
|
||||
import org.apache.hadoop.hbase.filter.NullComparator;
|
||||
import org.apache.hadoop.hbase.filter.PrefixFilter;
|
||||
import org.apache.hadoop.hbase.filter.SingleColumnValueFilter;
|
||||
import org.apache.hadoop.hbase.regionserver.HRegion.RegionScannerImpl;
|
||||
import org.apache.hadoop.hbase.regionserver.wal.HLog;
|
||||
import org.apache.hadoop.hbase.util.Bytes;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManagerTestHelper;
|
||||
import org.apache.hadoop.hbase.util.IncrementingEnvironmentEdge;
|
||||
import org.apache.hadoop.hbase.util.ManualEnvironmentEdge;
|
||||
import org.apache.hadoop.hbase.util.Pair;
|
||||
import org.apache.hadoop.hbase.util.PairOfSameType;
|
||||
import org.apache.hadoop.hbase.util.Threads;
|
||||
import org.junit.Test;
|
||||
|
||||
import com.google.common.collect.Lists;
|
||||
|
||||
|
||||
/**
|
||||
* Testing of multiPut in parallel.
|
||||
*
|
||||
*/
|
||||
public class TestParallelPut extends HBaseTestCase {
|
||||
static final Log LOG = LogFactory.getLog(TestParallelPut.class);
|
||||
|
||||
private static HRegion region = null;
|
||||
private static HBaseTestingUtility hbtu = new HBaseTestingUtility();
|
||||
private static final String DIR = hbtu.getDataTestDir() + "/TestParallelPut/";
|
||||
|
||||
// Test names
|
||||
static final byte[] tableName = Bytes.toBytes("testtable");;
|
||||
static final byte[] qual1 = Bytes.toBytes("qual1");
|
||||
static final byte[] qual2 = Bytes.toBytes("qual2");
|
||||
static final byte[] qual3 = Bytes.toBytes("qual3");
|
||||
static final byte[] value1 = Bytes.toBytes("value1");
|
||||
static final byte[] value2 = Bytes.toBytes("value2");
|
||||
static final byte [] row = Bytes.toBytes("rowA");
|
||||
static final byte [] row2 = Bytes.toBytes("rowB");
|
||||
|
||||
/**
|
||||
* @see org.apache.hadoop.hbase.HBaseTestCase#setUp()
|
||||
*/
|
||||
@Override
|
||||
protected void setUp() throws Exception {
|
||||
super.setUp();
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void tearDown() throws Exception {
|
||||
super.tearDown();
|
||||
EnvironmentEdgeManagerTestHelper.reset();
|
||||
}
|
||||
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
// New tests that don't spin up a mini cluster but rather just test the
|
||||
// individual code pieces in the HRegion.
|
||||
//////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
/**
|
||||
* Test one put command.
|
||||
*/
|
||||
public void testPut() throws IOException {
|
||||
LOG.info("Starting testPut");
|
||||
initHRegion(tableName, getName(), fam1);
|
||||
|
||||
long value = 1L;
|
||||
|
||||
Put put = new Put(row);
|
||||
put.add(fam1, qual1, Bytes.toBytes(value));
|
||||
region.put(put);
|
||||
|
||||
assertGet(row, fam1, qual1, Bytes.toBytes(value));
|
||||
}
|
||||
|
||||
/**
|
||||
* Test multi-threaded Puts.
|
||||
*/
|
||||
public void testParallelPuts() throws IOException {
|
||||
|
||||
LOG.info("Starting testParallelPuts");
|
||||
initHRegion(tableName, getName(), fam1);
|
||||
int numOps = 1000; // these many operations per thread
|
||||
|
||||
// create 100 threads, each will do its own puts
|
||||
int numThreads = 100;
|
||||
Putter[] all = new Putter[numThreads];
|
||||
|
||||
// create all threads
|
||||
for (int i = 0; i < numThreads; i++) {
|
||||
all[i] = new Putter(region, i, numOps);
|
||||
}
|
||||
|
||||
// run all threads
|
||||
for (int i = 0; i < numThreads; i++) {
|
||||
all[i].start();
|
||||
}
|
||||
|
||||
// wait for all threads to finish
|
||||
for (int i = 0; i < numThreads; i++) {
|
||||
try {
|
||||
all[i].join();
|
||||
} catch (InterruptedException e) {
|
||||
LOG.warn("testParallelPuts encountered InterruptedException." +
|
||||
" Ignoring....", e);
|
||||
}
|
||||
}
|
||||
LOG.info("testParallelPuts successfully verified " +
|
||||
(numOps * numThreads) + " put operations.");
|
||||
}
|
||||
|
||||
|
||||
static private void assertGet(byte [] row,
|
||||
byte [] familiy,
|
||||
byte[] qualifier,
|
||||
byte[] value) throws IOException {
|
||||
// run a get and see if the value matches
|
||||
Get get = new Get(row);
|
||||
get.addColumn(familiy, qualifier);
|
||||
Result result = region.get(get, null);
|
||||
assertEquals(1, result.size());
|
||||
|
||||
KeyValue kv = result.raw()[0];
|
||||
byte[] r = kv.getValue();
|
||||
assertTrue(Bytes.compareTo(r, value) == 0);
|
||||
}
|
||||
|
||||
private void initHRegion(byte [] tableName, String callingMethod,
|
||||
byte[] ... families)
|
||||
throws IOException {
|
||||
initHRegion(tableName, callingMethod, HBaseConfiguration.create(), families);
|
||||
}
|
||||
|
||||
private void initHRegion(byte [] tableName, String callingMethod,
|
||||
Configuration conf, byte [] ... families)
|
||||
throws IOException{
|
||||
HTableDescriptor htd = new HTableDescriptor(tableName);
|
||||
for(byte [] family : families) {
|
||||
htd.addFamily(new HColumnDescriptor(family));
|
||||
}
|
||||
HRegionInfo info = new HRegionInfo(htd.getName(), null, null, false);
|
||||
Path path = new Path(DIR + callingMethod);
|
||||
if (fs.exists(path)) {
|
||||
if (!fs.delete(path, true)) {
|
||||
throw new IOException("Failed delete of " + path);
|
||||
}
|
||||
}
|
||||
region = HRegion.createHRegion(info, path, conf, htd);
|
||||
}
|
||||
|
||||
/**
|
||||
* A thread that makes a few put calls
|
||||
*/
|
||||
public static class Putter extends Thread {
|
||||
|
||||
private final HRegion region;
|
||||
private final int threadNumber;
|
||||
private final int numOps;
|
||||
private final Random rand = new Random();
|
||||
byte [] rowkey = null;
|
||||
|
||||
public Putter(HRegion region, int threadNumber, int numOps) {
|
||||
this.region = region;
|
||||
this.threadNumber = threadNumber;
|
||||
this.numOps = numOps;
|
||||
this.rowkey = Bytes.toBytes((long)threadNumber); // unique rowid per thread
|
||||
setDaemon(true);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void run() {
|
||||
byte[] value = new byte[100];
|
||||
Put[] in = new Put[1];
|
||||
|
||||
// iterate for the specified number of operations
|
||||
for (int i=0; i<numOps; i++) {
|
||||
// generate random bytes
|
||||
rand.nextBytes(value);
|
||||
|
||||
// put the randombytes and verify that we can read it. This is one
|
||||
// way of ensuring that rwcc manipulation in HRegion.put() is fine.
|
||||
Put put = new Put(rowkey);
|
||||
put.add(fam1, qual1, value);
|
||||
in[0] = put;
|
||||
try {
|
||||
OperationStatus[] ret = region.put(in);
|
||||
assertEquals(1, ret.length);
|
||||
assertEquals(OperationStatusCode.SUCCESS, ret[0].getOperationStatusCode());
|
||||
assertGet(rowkey, fam1, qual1, value);
|
||||
} catch (IOException e) {
|
||||
assertTrue("Thread id " + threadNumber + " operation " + i + " failed.",
|
||||
false);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
|
@ -618,7 +618,7 @@ public class TestStore extends TestCase {
|
|||
StoreFlusher storeFlusher = store.getStoreFlusher(id);
|
||||
storeFlusher.prepare();
|
||||
storeFlusher.flushCache(Mockito.mock(MonitoredTask.class));
|
||||
storeFlusher.commit();
|
||||
storeFlusher.commit(Mockito.mock(MonitoredTask.class));
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -49,6 +49,7 @@ import org.apache.hadoop.hbase.MiniHBaseCluster;
|
|||
import org.apache.hadoop.hbase.client.HBaseAdmin;
|
||||
import org.apache.hadoop.hbase.client.HTable;
|
||||
import org.apache.hadoop.hbase.client.Put;
|
||||
import org.apache.hadoop.hbase.client.Get;
|
||||
import org.apache.hadoop.hbase.client.Result;
|
||||
import org.apache.hadoop.hbase.client.ResultScanner;
|
||||
import org.apache.hadoop.hbase.client.Scan;
|
||||
|
@ -250,6 +251,17 @@ public class TestLogRolling {
|
|||
}
|
||||
}
|
||||
|
||||
void validateData(HTable table, int rownum) throws IOException {
|
||||
String row = "row" + String.format("%1$04d", rownum);
|
||||
Get get = new Get(Bytes.toBytes(row));
|
||||
get.addFamily(HConstants.CATALOG_FAMILY);
|
||||
Result result = table.get(get);
|
||||
assertTrue(result.size() == 1);
|
||||
assertTrue(Bytes.equals(value,
|
||||
result.getValue(HConstants.CATALOG_FAMILY, null)));
|
||||
LOG.info("Validated row " + row);
|
||||
}
|
||||
|
||||
void batchWriteAndWait(HTable table, int start, boolean expect, int timeout)
|
||||
throws IOException {
|
||||
for (int i = 0; i < 10; i++) {
|
||||
|
@ -462,6 +474,7 @@ public class TestLogRolling {
|
|||
Thread.sleep(1000);
|
||||
dfsCluster.waitActive();
|
||||
LOG.info("Data Nodes restarted");
|
||||
validateData(table, 1002);
|
||||
|
||||
// this write should succeed, but trigger a log roll
|
||||
writeData(table, 1003);
|
||||
|
@ -469,6 +482,7 @@ public class TestLogRolling {
|
|||
|
||||
assertTrue("Missing datanode should've triggered a log roll",
|
||||
newFilenum > oldFilenum && newFilenum > curTime);
|
||||
validateData(table, 1003);
|
||||
|
||||
writeData(table, 1004);
|
||||
|
||||
|
@ -477,6 +491,7 @@ public class TestLogRolling {
|
|||
Thread.sleep(1000);
|
||||
dfsCluster.waitActive();
|
||||
LOG.info("Data Nodes restarted");
|
||||
validateData(table, 1004);
|
||||
|
||||
// this write should succeed, but trigger a log roll
|
||||
writeData(table, 1005);
|
||||
|
|
Loading…
Reference in New Issue