HBASE-13877 Interrupt to flush from TableFlushProcedure causes dataloss in ITBLL
This commit is contained in:
parent
682b8ab8a5
commit
c6dd3f965b
|
@ -219,7 +219,7 @@ public class IntegrationTestBigLinkedList extends IntegrationTestBase {
|
||||||
|
|
||||||
protected int NUM_SLAVES_BASE = 3; // number of slaves for the cluster
|
protected int NUM_SLAVES_BASE = 3; // number of slaves for the cluster
|
||||||
|
|
||||||
private static final int MISSING_ROWS_TO_LOG = 2; // YARN complains when too many counters
|
private static final int MISSING_ROWS_TO_LOG = 10; // YARN complains when too many counters
|
||||||
|
|
||||||
private static final int WIDTH_DEFAULT = 1000000;
|
private static final int WIDTH_DEFAULT = 1000000;
|
||||||
private static final int WRAP_DEFAULT = 25;
|
private static final int WRAP_DEFAULT = 25;
|
||||||
|
@ -665,6 +665,7 @@ public class IntegrationTestBigLinkedList extends IntegrationTestBase {
|
||||||
*/
|
*/
|
||||||
public static class WALMapperSearcher extends WALMapper {
|
public static class WALMapperSearcher extends WALMapper {
|
||||||
private SortedSet<byte []> keysToFind;
|
private SortedSet<byte []> keysToFind;
|
||||||
|
private AtomicInteger rows = new AtomicInteger(0);
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public void setup(Mapper<WALKey, WALEdit, ImmutableBytesWritable, Mutation>.Context context)
|
public void setup(Mapper<WALKey, WALEdit, ImmutableBytesWritable, Mutation>.Context context)
|
||||||
|
@ -686,8 +687,15 @@ public class IntegrationTestBigLinkedList extends IntegrationTestBase {
|
||||||
boolean b = this.keysToFind.contains(row);
|
boolean b = this.keysToFind.contains(row);
|
||||||
if (b) {
|
if (b) {
|
||||||
String keyStr = Bytes.toStringBinary(row);
|
String keyStr = Bytes.toStringBinary(row);
|
||||||
LOG.info("Found cell=" + cell);
|
try {
|
||||||
context.getCounter(FOUND_GROUP_KEY, keyStr).increment(1);
|
LOG.info("Found cell=" + cell + " , walKey=" + context.getCurrentKey());
|
||||||
|
} catch (IOException|InterruptedException e) {
|
||||||
|
LOG.warn(e);
|
||||||
|
}
|
||||||
|
if (rows.addAndGet(1) < MISSING_ROWS_TO_LOG) {
|
||||||
|
context.getCounter(FOUND_GROUP_KEY, keyStr).increment(1);
|
||||||
|
}
|
||||||
|
context.getCounter(FOUND_GROUP_KEY, "CELL_WITH_MISSING_ROW").increment(1);
|
||||||
}
|
}
|
||||||
return b;
|
return b;
|
||||||
}
|
}
|
||||||
|
|
|
@ -29,7 +29,6 @@ import org.apache.hadoop.hbase.procedure.ProcedureMember;
|
||||||
import org.apache.hadoop.hbase.procedure.Subprocedure;
|
import org.apache.hadoop.hbase.procedure.Subprocedure;
|
||||||
import org.apache.hadoop.hbase.procedure.flush.RegionServerFlushTableProcedureManager.FlushTableSubprocedurePool;
|
import org.apache.hadoop.hbase.procedure.flush.RegionServerFlushTableProcedureManager.FlushTableSubprocedurePool;
|
||||||
import org.apache.hadoop.hbase.regionserver.Region;
|
import org.apache.hadoop.hbase.regionserver.Region;
|
||||||
import org.apache.hadoop.hbase.regionserver.Region.Operation;
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This flush region implementation uses the distributed procedure framework to flush
|
* This flush region implementation uses the distributed procedure framework to flush
|
||||||
|
|
|
@ -33,7 +33,9 @@ import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.hbase.Abortable;
|
||||||
import org.apache.hadoop.hbase.DaemonThreadFactory;
|
import org.apache.hadoop.hbase.DaemonThreadFactory;
|
||||||
|
import org.apache.hadoop.hbase.DroppedSnapshotException;
|
||||||
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
|
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
|
||||||
import org.apache.hadoop.hbase.TableName;
|
import org.apache.hadoop.hbase.TableName;
|
||||||
import org.apache.hadoop.hbase.errorhandling.ForeignException;
|
import org.apache.hadoop.hbase.errorhandling.ForeignException;
|
||||||
|
@ -157,7 +159,7 @@ public class RegionServerFlushTableProcedureManager extends RegionServerProcedur
|
||||||
FLUSH_REQUEST_WAKE_MILLIS_DEFAULT);
|
FLUSH_REQUEST_WAKE_MILLIS_DEFAULT);
|
||||||
|
|
||||||
FlushTableSubprocedurePool taskManager =
|
FlushTableSubprocedurePool taskManager =
|
||||||
new FlushTableSubprocedurePool(rss.getServerName().toString(), conf);
|
new FlushTableSubprocedurePool(rss.getServerName().toString(), conf, rss);
|
||||||
return new FlushTableSubprocedure(member, exnDispatcher, wakeMillis,
|
return new FlushTableSubprocedure(member, exnDispatcher, wakeMillis,
|
||||||
timeoutMillis, involvedRegions, table, taskManager);
|
timeoutMillis, involvedRegions, table, taskManager);
|
||||||
}
|
}
|
||||||
|
@ -195,13 +197,15 @@ public class RegionServerFlushTableProcedureManager extends RegionServerProcedur
|
||||||
* failures.
|
* failures.
|
||||||
*/
|
*/
|
||||||
static class FlushTableSubprocedurePool {
|
static class FlushTableSubprocedurePool {
|
||||||
|
private final Abortable abortable;
|
||||||
private final ExecutorCompletionService<Void> taskPool;
|
private final ExecutorCompletionService<Void> taskPool;
|
||||||
private final ThreadPoolExecutor executor;
|
private final ThreadPoolExecutor executor;
|
||||||
private volatile boolean stopped;
|
private volatile boolean stopped;
|
||||||
private final List<Future<Void>> futures = new ArrayList<Future<Void>>();
|
private final List<Future<Void>> futures = new ArrayList<Future<Void>>();
|
||||||
private final String name;
|
private final String name;
|
||||||
|
|
||||||
FlushTableSubprocedurePool(String name, Configuration conf) {
|
FlushTableSubprocedurePool(String name, Configuration conf, Abortable abortable) {
|
||||||
|
this.abortable = abortable;
|
||||||
// configure the executor service
|
// configure the executor service
|
||||||
long keepAlive = conf.getLong(
|
long keepAlive = conf.getLong(
|
||||||
RegionServerFlushTableProcedureManager.FLUSH_TIMEOUT_MILLIS_KEY,
|
RegionServerFlushTableProcedureManager.FLUSH_TIMEOUT_MILLIS_KEY,
|
||||||
|
@ -259,9 +263,13 @@ public class RegionServerFlushTableProcedureManager extends RegionServerProcedur
|
||||||
}
|
}
|
||||||
// we are stopped so we can just exit.
|
// we are stopped so we can just exit.
|
||||||
} catch (ExecutionException e) {
|
} catch (ExecutionException e) {
|
||||||
if (e.getCause() instanceof ForeignException) {
|
Throwable cause = e.getCause();
|
||||||
|
if (cause instanceof ForeignException) {
|
||||||
LOG.warn("Rethrowing ForeignException from FlushSubprocedurePool", e);
|
LOG.warn("Rethrowing ForeignException from FlushSubprocedurePool", e);
|
||||||
throw (ForeignException)e.getCause();
|
throw (ForeignException)e.getCause();
|
||||||
|
} else if (cause instanceof DroppedSnapshotException) {
|
||||||
|
// we have to abort the region server according to contract of flush
|
||||||
|
abortable.abort("Received DroppedSnapshotException, aborting", cause);
|
||||||
}
|
}
|
||||||
LOG.warn("Got Exception in FlushSubprocedurePool", e);
|
LOG.warn("Got Exception in FlushSubprocedurePool", e);
|
||||||
throw new ForeignException(name, e.getCause());
|
throw new ForeignException(name, e.getCause());
|
||||||
|
@ -272,7 +280,8 @@ public class RegionServerFlushTableProcedureManager extends RegionServerProcedur
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* This attempts to cancel out all pending and in progress tasks (interruptions issues)
|
* This attempts to cancel out all pending and in progress tasks. Does not interrupt the running
|
||||||
|
* tasks itself. An ongoing HRegion.flush() should not be interrupted (see HBASE-13877).
|
||||||
* @throws InterruptedException
|
* @throws InterruptedException
|
||||||
*/
|
*/
|
||||||
void cancelTasks() throws InterruptedException {
|
void cancelTasks() throws InterruptedException {
|
||||||
|
@ -289,13 +298,14 @@ public class RegionServerFlushTableProcedureManager extends RegionServerProcedur
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Abruptly shutdown the thread pool. Call when exiting a region server.
|
* Gracefully shutdown the thread pool. An ongoing HRegion.flush() should not be
|
||||||
|
* interrupted (see HBASE-13877)
|
||||||
*/
|
*/
|
||||||
void stop() {
|
void stop() {
|
||||||
if (this.stopped) return;
|
if (this.stopped) return;
|
||||||
|
|
||||||
this.stopped = true;
|
this.stopped = true;
|
||||||
this.executor.shutdownNow();
|
this.executor.shutdown();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -475,6 +475,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
* FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_NO_COMPACTION_NEEDED.
|
* FLUSHED_NO_COMPACTION_NEEDED or FLUSHED_NO_COMPACTION_NEEDED.
|
||||||
* @return true if the memstores were flushed, else false.
|
* @return true if the memstores were flushed, else false.
|
||||||
*/
|
*/
|
||||||
|
@Override
|
||||||
public boolean isFlushSucceeded() {
|
public boolean isFlushSucceeded() {
|
||||||
return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
|
return result == Result.FLUSHED_NO_COMPACTION_NEEDED || result == Result
|
||||||
.FLUSHED_COMPACTION_NEEDED;
|
.FLUSHED_COMPACTION_NEEDED;
|
||||||
|
@ -484,6 +485,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
* Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
|
* Convenience method, the equivalent of checking if result is FLUSHED_COMPACTION_NEEDED.
|
||||||
* @return True if the flush requested a compaction, else false (doesn't even mean it flushed).
|
* @return True if the flush requested a compaction, else false (doesn't even mean it flushed).
|
||||||
*/
|
*/
|
||||||
|
@Override
|
||||||
public boolean isCompactionNeeded() {
|
public boolean isCompactionNeeded() {
|
||||||
return result == Result.FLUSHED_COMPACTION_NEEDED;
|
return result == Result.FLUSHED_COMPACTION_NEEDED;
|
||||||
}
|
}
|
||||||
|
@ -1272,6 +1274,9 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
* vector if already closed and null if judged that it should not close.
|
* vector if already closed and null if judged that it should not close.
|
||||||
*
|
*
|
||||||
* @throws IOException e
|
* @throws IOException e
|
||||||
|
* @throws DroppedSnapshotException Thrown when replay of wal is required
|
||||||
|
* because a Snapshot was not properly persisted. The region is put in closing mode, and the
|
||||||
|
* caller MUST abort after this.
|
||||||
*/
|
*/
|
||||||
public Map<byte[], List<StoreFile>> close() throws IOException {
|
public Map<byte[], List<StoreFile>> close() throws IOException {
|
||||||
return close(false);
|
return close(false);
|
||||||
|
@ -1309,6 +1314,9 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
* we are not to close at this time or we are already closed.
|
* we are not to close at this time or we are already closed.
|
||||||
*
|
*
|
||||||
* @throws IOException e
|
* @throws IOException e
|
||||||
|
* @throws DroppedSnapshotException Thrown when replay of wal is required
|
||||||
|
* because a Snapshot was not properly persisted. The region is put in closing mode, and the
|
||||||
|
* caller MUST abort after this.
|
||||||
*/
|
*/
|
||||||
public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
|
public Map<byte[], List<StoreFile>> close(final boolean abort) throws IOException {
|
||||||
// Only allow one thread to close at a time. Serialize them so dual
|
// Only allow one thread to close at a time. Serialize them so dual
|
||||||
|
@ -1327,6 +1335,14 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Exposed for some very specific unit tests.
|
||||||
|
*/
|
||||||
|
@VisibleForTesting
|
||||||
|
public void setClosing(boolean closing) {
|
||||||
|
this.closing.set(closing);
|
||||||
|
}
|
||||||
|
|
||||||
private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
|
private Map<byte[], List<StoreFile>> doClose(final boolean abort, MonitoredTask status)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
if (isClosed()) {
|
if (isClosed()) {
|
||||||
|
@ -1826,7 +1842,8 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
*
|
*
|
||||||
* @throws IOException general io exceptions
|
* @throws IOException general io exceptions
|
||||||
* @throws DroppedSnapshotException Thrown when replay of wal is required
|
* @throws DroppedSnapshotException Thrown when replay of wal is required
|
||||||
* because a Snapshot was not properly persisted.
|
* because a Snapshot was not properly persisted. The region is put in closing mode, and the
|
||||||
|
* caller MUST abort after this.
|
||||||
*/
|
*/
|
||||||
public FlushResult flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker)
|
public FlushResult flushcache(boolean forceFlushAllStores, boolean writeFlushRequestWalMarker)
|
||||||
throws IOException {
|
throws IOException {
|
||||||
|
@ -2337,6 +2354,18 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
Bytes.toStringBinary(getRegionInfo().getRegionName()));
|
Bytes.toStringBinary(getRegionInfo().getRegionName()));
|
||||||
dse.initCause(t);
|
dse.initCause(t);
|
||||||
status.abort("Flush failed: " + StringUtils.stringifyException(t));
|
status.abort("Flush failed: " + StringUtils.stringifyException(t));
|
||||||
|
|
||||||
|
// Callers for flushcache() should catch DroppedSnapshotException and abort the region server.
|
||||||
|
// However, since we may have the region read lock, we cannot call close(true) here since
|
||||||
|
// we cannot promote to a write lock. Instead we are setting closing so that all other region
|
||||||
|
// operations except for close will be rejected.
|
||||||
|
this.closing.set(true);
|
||||||
|
|
||||||
|
if (rsServices != null) {
|
||||||
|
// This is a safeguard against the case where the caller fails to explicitly handle aborting
|
||||||
|
rsServices.abort("Replay of WAL required. Forcing server shutdown", dse);
|
||||||
|
}
|
||||||
|
|
||||||
throw dse;
|
throw dse;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -6407,6 +6436,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
return results;
|
return results;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@Override
|
||||||
public void mutateRow(RowMutations rm) throws IOException {
|
public void mutateRow(RowMutations rm) throws IOException {
|
||||||
// Don't need nonces here - RowMutations only supports puts and deletes
|
// Don't need nonces here - RowMutations only supports puts and deletes
|
||||||
mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
|
mutateRowsWithLocks(rm.getMutations(), Collections.singleton(rm.getRow()));
|
||||||
|
@ -6433,6 +6463,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
* <code>rowsToLock</code> is sorted in order to avoid deadlocks.
|
* <code>rowsToLock</code> is sorted in order to avoid deadlocks.
|
||||||
* @throws IOException
|
* @throws IOException
|
||||||
*/
|
*/
|
||||||
|
@Override
|
||||||
public void mutateRowsWithLocks(Collection<Mutation> mutations,
|
public void mutateRowsWithLocks(Collection<Mutation> mutations,
|
||||||
Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
|
Collection<byte[]> rowsToLock, long nonceGroup, long nonce) throws IOException {
|
||||||
MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
|
MultiRowMutationProcessor proc = new MultiRowMutationProcessor(mutations, rowsToLock);
|
||||||
|
@ -7436,6 +7467,7 @@ public class HRegion implements HeapSize, PropagatingConfigurationObserver, Regi
|
||||||
|
|
||||||
|
|
||||||
/** @return the coprocessor host */
|
/** @return the coprocessor host */
|
||||||
|
@Override
|
||||||
public RegionCoprocessorHost getCoprocessorHost() {
|
public RegionCoprocessorHost getCoprocessorHost() {
|
||||||
return coprocessorHost;
|
return coprocessorHost;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1329,6 +1329,9 @@ public class RSRpcServices implements HBaseRPCErrorHandler,
|
||||||
}
|
}
|
||||||
regionServer.compactSplitThread.requestRegionsMerge(regionA, regionB, forcible);
|
regionServer.compactSplitThread.requestRegionsMerge(regionA, regionB, forcible);
|
||||||
return MergeRegionsResponse.newBuilder().build();
|
return MergeRegionsResponse.newBuilder().build();
|
||||||
|
} catch (DroppedSnapshotException ex) {
|
||||||
|
regionServer.abort("Replay of WAL required. Forcing server shutdown", ex);
|
||||||
|
throw new ServiceException(ex);
|
||||||
} catch (IOException ie) {
|
} catch (IOException ie) {
|
||||||
throw new ServiceException(ie);
|
throw new ServiceException(ie);
|
||||||
}
|
}
|
||||||
|
@ -1741,6 +1744,9 @@ public class RSRpcServices implements HBaseRPCErrorHandler,
|
||||||
((HRegion)region).forceSplit(splitPoint);
|
((HRegion)region).forceSplit(splitPoint);
|
||||||
regionServer.compactSplitThread.requestSplit(region, ((HRegion)region).checkSplit());
|
regionServer.compactSplitThread.requestSplit(region, ((HRegion)region).checkSplit());
|
||||||
return SplitRegionResponse.newBuilder().build();
|
return SplitRegionResponse.newBuilder().build();
|
||||||
|
} catch (DroppedSnapshotException ex) {
|
||||||
|
regionServer.abort("Replay of WAL required. Forcing server shutdown", ex);
|
||||||
|
throw new ServiceException(ex);
|
||||||
} catch (IOException ie) {
|
} catch (IOException ie) {
|
||||||
throw new ServiceException(ie);
|
throw new ServiceException(ie);
|
||||||
}
|
}
|
||||||
|
|
|
@ -653,6 +653,8 @@ public interface Region extends ConfigurationObserver {
|
||||||
*
|
*
|
||||||
* @throws IOException general io exceptions
|
* @throws IOException general io exceptions
|
||||||
* because a snapshot was not properly persisted.
|
* because a snapshot was not properly persisted.
|
||||||
|
* @throws DroppedSnapshotException Thrown when abort is required. The caller MUST catch this
|
||||||
|
* exception and MUST abort. Any further operation to the region may cause data loss.
|
||||||
*/
|
*/
|
||||||
FlushResult flush(boolean force) throws IOException;
|
FlushResult flush(boolean force) throws IOException;
|
||||||
|
|
||||||
|
|
|
@ -23,6 +23,7 @@ import java.io.IOException;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||||
|
import org.apache.hadoop.hbase.DroppedSnapshotException;
|
||||||
import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
|
import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
|
||||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||||
import org.apache.hadoop.ipc.RemoteException;
|
import org.apache.hadoop.ipc.RemoteException;
|
||||||
|
@ -93,6 +94,10 @@ class RegionMergeRequest implements Runnable {
|
||||||
+ (this.server.isStopping() ? " stopping" : " stopped"), e);
|
+ (this.server.isStopping() ? " stopping" : " stopped"), e);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (e instanceof DroppedSnapshotException) {
|
||||||
|
server.abort("Replay of WAL required. Forcing server shutdown", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
LOG.warn("Running rollback/cleanup of failed merge of "
|
LOG.warn("Running rollback/cleanup of failed merge of "
|
||||||
+ region_a +" and "+ region_b + "; " + e.getMessage(), e);
|
+ region_a +" and "+ region_b + "; " + e.getMessage(), e);
|
||||||
|
|
|
@ -23,6 +23,7 @@ import java.io.IOException;
|
||||||
import org.apache.commons.logging.Log;
|
import org.apache.commons.logging.Log;
|
||||||
import org.apache.commons.logging.LogFactory;
|
import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||||
|
import org.apache.hadoop.hbase.DroppedSnapshotException;
|
||||||
import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
|
import org.apache.hadoop.hbase.master.TableLockManager.TableLock;
|
||||||
import org.apache.hadoop.hbase.util.Bytes;
|
import org.apache.hadoop.hbase.util.Bytes;
|
||||||
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
|
||||||
|
@ -91,6 +92,10 @@ class SplitRequest implements Runnable {
|
||||||
+ (this.server.isStopping() ? " stopping" : " stopped"), e);
|
+ (this.server.isStopping() ? " stopping" : " stopped"), e);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (e instanceof DroppedSnapshotException) {
|
||||||
|
server.abort("Replay of WAL required. Forcing server shutdown", e);
|
||||||
|
return;
|
||||||
|
}
|
||||||
try {
|
try {
|
||||||
LOG.info("Running rollback/cleanup of failed split of " +
|
LOG.info("Running rollback/cleanup of failed split of " +
|
||||||
parent.getRegionInfo().getRegionNameAsString() + "; " + e.getMessage(), e);
|
parent.getRegionInfo().getRegionNameAsString() + "; " + e.getMessage(), e);
|
||||||
|
|
|
@ -35,7 +35,9 @@ import org.apache.commons.logging.LogFactory;
|
||||||
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
import org.apache.hadoop.hbase.classification.InterfaceAudience;
|
||||||
import org.apache.hadoop.hbase.classification.InterfaceStability;
|
import org.apache.hadoop.hbase.classification.InterfaceStability;
|
||||||
import org.apache.hadoop.conf.Configuration;
|
import org.apache.hadoop.conf.Configuration;
|
||||||
|
import org.apache.hadoop.hbase.Abortable;
|
||||||
import org.apache.hadoop.hbase.DaemonThreadFactory;
|
import org.apache.hadoop.hbase.DaemonThreadFactory;
|
||||||
|
import org.apache.hadoop.hbase.DroppedSnapshotException;
|
||||||
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
|
import org.apache.hadoop.hbase.HBaseInterfaceAudience;
|
||||||
import org.apache.hadoop.hbase.TableName;
|
import org.apache.hadoop.hbase.TableName;
|
||||||
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
|
||||||
|
@ -184,7 +186,7 @@ public class RegionServerSnapshotManager extends RegionServerProcedureManager {
|
||||||
switch (snapshot.getType()) {
|
switch (snapshot.getType()) {
|
||||||
case FLUSH:
|
case FLUSH:
|
||||||
SnapshotSubprocedurePool taskManager =
|
SnapshotSubprocedurePool taskManager =
|
||||||
new SnapshotSubprocedurePool(rss.getServerName().toString(), conf);
|
new SnapshotSubprocedurePool(rss.getServerName().toString(), conf, rss);
|
||||||
return new FlushSnapshotSubprocedure(member, exnDispatcher, wakeMillis,
|
return new FlushSnapshotSubprocedure(member, exnDispatcher, wakeMillis,
|
||||||
timeoutMillis, involvedRegions, snapshot, taskManager);
|
timeoutMillis, involvedRegions, snapshot, taskManager);
|
||||||
case SKIPFLUSH:
|
case SKIPFLUSH:
|
||||||
|
@ -196,7 +198,7 @@ public class RegionServerSnapshotManager extends RegionServerProcedureManager {
|
||||||
* To minimized the code change, class name is not changed.
|
* To minimized the code change, class name is not changed.
|
||||||
*/
|
*/
|
||||||
SnapshotSubprocedurePool taskManager2 =
|
SnapshotSubprocedurePool taskManager2 =
|
||||||
new SnapshotSubprocedurePool(rss.getServerName().toString(), conf);
|
new SnapshotSubprocedurePool(rss.getServerName().toString(), conf, rss);
|
||||||
return new FlushSnapshotSubprocedure(member, exnDispatcher, wakeMillis,
|
return new FlushSnapshotSubprocedure(member, exnDispatcher, wakeMillis,
|
||||||
timeoutMillis, involvedRegions, snapshot, taskManager2);
|
timeoutMillis, involvedRegions, snapshot, taskManager2);
|
||||||
|
|
||||||
|
@ -265,13 +267,15 @@ public class RegionServerSnapshotManager extends RegionServerProcedureManager {
|
||||||
* operations such as compactions and replication sinks.
|
* operations such as compactions and replication sinks.
|
||||||
*/
|
*/
|
||||||
static class SnapshotSubprocedurePool {
|
static class SnapshotSubprocedurePool {
|
||||||
|
private final Abortable abortable;
|
||||||
private final ExecutorCompletionService<Void> taskPool;
|
private final ExecutorCompletionService<Void> taskPool;
|
||||||
private final ThreadPoolExecutor executor;
|
private final ThreadPoolExecutor executor;
|
||||||
private volatile boolean stopped;
|
private volatile boolean stopped;
|
||||||
private final List<Future<Void>> futures = new ArrayList<Future<Void>>();
|
private final List<Future<Void>> futures = new ArrayList<Future<Void>>();
|
||||||
private final String name;
|
private final String name;
|
||||||
|
|
||||||
SnapshotSubprocedurePool(String name, Configuration conf) {
|
SnapshotSubprocedurePool(String name, Configuration conf, Abortable abortable) {
|
||||||
|
this.abortable = abortable;
|
||||||
// configure the executor service
|
// configure the executor service
|
||||||
long keepAlive = conf.getLong(
|
long keepAlive = conf.getLong(
|
||||||
RegionServerSnapshotManager.SNAPSHOT_TIMEOUT_MILLIS_KEY,
|
RegionServerSnapshotManager.SNAPSHOT_TIMEOUT_MILLIS_KEY,
|
||||||
|
@ -331,9 +335,13 @@ public class RegionServerSnapshotManager extends RegionServerProcedureManager {
|
||||||
}
|
}
|
||||||
// we are stopped so we can just exit.
|
// we are stopped so we can just exit.
|
||||||
} catch (ExecutionException e) {
|
} catch (ExecutionException e) {
|
||||||
if (e.getCause() instanceof ForeignException) {
|
Throwable cause = e.getCause();
|
||||||
|
if (cause instanceof ForeignException) {
|
||||||
LOG.warn("Rethrowing ForeignException from SnapshotSubprocedurePool", e);
|
LOG.warn("Rethrowing ForeignException from SnapshotSubprocedurePool", e);
|
||||||
throw (ForeignException)e.getCause();
|
throw (ForeignException)e.getCause();
|
||||||
|
} else if (cause instanceof DroppedSnapshotException) {
|
||||||
|
// we have to abort the region server according to contract of flush
|
||||||
|
abortable.abort("Received DroppedSnapshotException, aborting", cause);
|
||||||
}
|
}
|
||||||
LOG.warn("Got Exception in SnapshotSubprocedurePool", e);
|
LOG.warn("Got Exception in SnapshotSubprocedurePool", e);
|
||||||
throw new ForeignException(name, e.getCause());
|
throw new ForeignException(name, e.getCause());
|
||||||
|
@ -371,7 +379,7 @@ public class RegionServerSnapshotManager extends RegionServerProcedureManager {
|
||||||
if (this.stopped) return;
|
if (this.stopped) return;
|
||||||
|
|
||||||
this.stopped = true;
|
this.stopped = true;
|
||||||
this.executor.shutdownNow();
|
this.executor.shutdown();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -412,6 +412,7 @@ public class TestHRegion {
|
||||||
Assert.fail("Didn't bubble up IOE!");
|
Assert.fail("Didn't bubble up IOE!");
|
||||||
} catch (DroppedSnapshotException dse) {
|
} catch (DroppedSnapshotException dse) {
|
||||||
// What we are expecting
|
// What we are expecting
|
||||||
|
region.closing.set(false); // this is needed for the rest of the test to work
|
||||||
}
|
}
|
||||||
// Make it so all writes succeed from here on out
|
// Make it so all writes succeed from here on out
|
||||||
ffs.fault.set(false);
|
ffs.fault.set(false);
|
||||||
|
|
|
@ -714,6 +714,8 @@ public class TestWALReplay {
|
||||||
+ t.getMessage());
|
+ t.getMessage());
|
||||||
// simulated to abort server
|
// simulated to abort server
|
||||||
Mockito.doReturn(true).when(rsServices).isAborted();
|
Mockito.doReturn(true).when(rsServices).isAborted();
|
||||||
|
region.setClosing(false); // region normally does not accept writes after
|
||||||
|
// DroppedSnapshotException. We mock around it for this test.
|
||||||
}
|
}
|
||||||
// writing more data
|
// writing more data
|
||||||
int moreRow = 10;
|
int moreRow = 10;
|
||||||
|
|
Loading…
Reference in New Issue