[RECOVERY] Increment Store refcount on RecoveryTarget

Make sure we increment the store refcount before we start the recovery on the
recovery target; otherwise the store can be closed while the recovery is still running.

Closes #6844
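In effect the change guards the recovery work with a reference on the shard's store. The sketch below is illustrative only, not the actual RecoveryTarget code; runRecoveryPhases() is a hypothetical placeholder for the work the real doRecovery performs (see the RecoveryTarget.java diff below).

// Illustrative sketch only (not the actual RecoveryTarget code);
// runRecoveryPhases() is a hypothetical placeholder for the phase1/2/3 recovery work.
void doRecoveryGuarded(InternalIndexShard shard, RecoveryListener listener) {
    if (shard.store().tryIncRef()) {          // pin the store so it cannot be closed underneath the recovery
        try {
            runRecoveryPhases();              // hypothetical: the actual recovery work
        } finally {
            shard.store().decRef();           // always release the reference we took
        }
    } else {
        listener.onIgnoreRecovery(false, "local store closed, stop recovery");
    }
}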
Simon Willnauer 2014-07-13 22:01:26 +02:00
parent ab11c6821d
commit e8ff007852
2 changed files with 123 additions and 99 deletions

Store.java

@@ -169,16 +169,36 @@ public class Store extends AbstractIndexShardComponent implements CloseableIndex
      *
      * Note: Close can safely be called multiple times.
      * @see #decRef
+     * @see #tryIncRef()
+     * @throws AlreadyClosedException iff the reference counter can not be incremented.
      */
     public final void incRef() {
+        if (tryIncRef() == false) {
+            throw new AlreadyClosedException("Store is already closed can't increment refCount current count [" + refCount.get() + "]");
+        }
+    }
+
+    /**
+     * Tries to increment the refCount of this Store instance. This method will return <tt>true</tt> iff the refCount was
+     * incremented successfully otherwise <tt>false</tt>. RefCounts are used to determine when a
+     * Store can be closed safely, i.e. as soon as there are no more references. Be sure to always call a
+     * corresponding {@link #decRef}, in a finally clause; otherwise the store may never be closed. Note that
+     * {@link #close} simply calls decRef(), which means that the Store will not really be closed until {@link
+     * #decRef} has been called for all outstanding references.
+     *
+     * Note: Close can safely be called multiple times.
+     * @see #decRef()
+     * @see #incRef()
+     */
+    public final boolean tryIncRef() {
         do {
             int i = refCount.get();
             if (i > 0) {
                 if (refCount.compareAndSet(i, i + 1)) {
-                    return;
+                    return true;
                 }
             } else {
-                throw new AlreadyClosedException("Store is already closed can't increment refCount current count [" + i + "]");
+                return false;
             }
         } while (true);
     }
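For readers unfamiliar with the refcounting contract documented above, here is a self-contained illustration using a hypothetical RefCountedStore class (plain JDK; IllegalStateException stands in for Lucene's AlreadyClosedException). It demonstrates the pattern, not the Elasticsearch Store itself.

// Hypothetical, self-contained illustration of the refcounting contract above.
// Not the Elasticsearch Store; IllegalStateException stands in for Lucene's AlreadyClosedException.
import java.util.concurrent.atomic.AtomicInteger;

public class RefCountedStore {
    private final AtomicInteger refCount = new AtomicInteger(1); // the store itself holds one reference

    public final void incRef() {
        if (tryIncRef() == false) {
            throw new IllegalStateException("store is already closed, current count [" + refCount.get() + "]");
        }
    }

    public final boolean tryIncRef() {
        do {
            int i = refCount.get();
            if (i > 0) {
                // CAS loop: only bump the counter if the store has not reached zero (closed) in the meantime
                if (refCount.compareAndSet(i, i + 1)) {
                    return true;
                }
            } else {
                return false; // already closed, refuse to resurrect
            }
        } while (true);
    }

    public final void decRef() {
        if (refCount.decrementAndGet() == 0) {
            // last reference released: this is where the real Store would close its directory
        }
    }

    public static void main(String[] args) {
        RefCountedStore store = new RefCountedStore();
        if (store.tryIncRef()) {                  // the guard doRecovery now uses
            try {
                // ... work against the store ...
            } finally {
                store.decRef();                   // paired release in a finally clause
            }
        }
        store.decRef();                           // drop the initial reference -> store is closed
        System.out.println(store.tryIncRef());    // prints "false": no new references after close
    }
}

Either way, the rule the javadoc spells out holds: every successful incRef()/tryIncRef() must be paired with a decRef(), ideally in a finally clause, or the store is never closed.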

RecoveryTarget.java

@@ -172,7 +172,7 @@ public class RecoveryTarget extends AbstractComponent {
         threadPool.generic().execute(new Runnable() {
             @Override
             public void run() {
                 doRecovery(request, recoveryStatus, listener);
             }
         });
     }
@@ -188,7 +188,6 @@ public class RecoveryTarget extends AbstractComponent {
     private void doRecovery(final StartRecoveryRequest request, final RecoveryStatus recoveryStatus, final RecoveryListener listener) {
         assert request.sourceNode() != null : "can't do a recovery without a source node";
         final InternalIndexShard shard = recoveryStatus.indexShard;
         if (shard == null) {
             listener.onIgnoreRecovery(false, "shard missing locally, stop recovery");
@@ -205,106 +204,111 @@
         }
         recoveryStatus.recoveryThread = Thread.currentThread();
+        if (shard.store().tryIncRef()) {
             try {
                 logger.trace("[{}][{}] starting recovery from {}", request.shardId().index().name(), request.shardId().id(), request.sourceNode());
                 StopWatch stopWatch = new StopWatch().start();
                 RecoveryResponse recoveryResponse = transportService.submitRequest(request.sourceNode(), RecoverySource.Actions.START_RECOVERY, request, new FutureTransportResponseHandler<RecoveryResponse>() {
                     @Override
                     public RecoveryResponse newInstance() {
                         return new RecoveryResponse();
                     }
                 }).txGet();
                 if (shard.state() == IndexShardState.CLOSED) {
                     removeAndCleanOnGoingRecovery(recoveryStatus);
                     listener.onIgnoreRecovery(false, "local shard closed, stop recovery");
                     return;
                 }
                 stopWatch.stop();
                 if (logger.isTraceEnabled()) {
                     StringBuilder sb = new StringBuilder();
                     sb.append('[').append(request.shardId().index().name()).append(']').append('[').append(request.shardId().id()).append("] ");
                     sb.append("recovery completed from ").append(request.sourceNode()).append(", took[").append(stopWatch.totalTime()).append("]\n");
                     sb.append(" phase1: recovered_files [").append(recoveryResponse.phase1FileNames.size()).append("]").append(" with total_size of [").append(new ByteSizeValue(recoveryResponse.phase1TotalSize)).append("]")
                             .append(", took [").append(timeValueMillis(recoveryResponse.phase1Time)).append("], throttling_wait [").append(timeValueMillis(recoveryResponse.phase1ThrottlingWaitTime)).append(']')
                             .append("\n");
                     sb.append(" : reusing_files [").append(recoveryResponse.phase1ExistingFileNames.size()).append("] with total_size of [").append(new ByteSizeValue(recoveryResponse.phase1ExistingTotalSize)).append("]\n");
                     sb.append(" phase2: start took [").append(timeValueMillis(recoveryResponse.startTime)).append("]\n");
                     sb.append(" : recovered [").append(recoveryResponse.phase2Operations).append("]").append(" transaction log operations")
                             .append(", took [").append(timeValueMillis(recoveryResponse.phase2Time)).append("]")
                             .append("\n");
                     sb.append(" phase3: recovered [").append(recoveryResponse.phase3Operations).append("]").append(" transaction log operations")
                             .append(", took [").append(timeValueMillis(recoveryResponse.phase3Time)).append("]");
                     logger.trace(sb.toString());
                 } else if (logger.isDebugEnabled()) {
                     logger.debug("{} recovery completed from [{}], took [{}]", request.shardId(), request.sourceNode(), stopWatch.totalTime());
                 }
                 removeAndCleanOnGoingRecovery(recoveryStatus);
                 listener.onRecoveryDone();
             } catch (Throwable e) {
                 if (logger.isTraceEnabled()) {
                     logger.trace("[{}][{}] Got exception on recovery", e, request.shardId().index().name(), request.shardId().id());
                 }
                 if (recoveryStatus.isCanceled()) {
                     // don't remove it, the cancellation code will remove it...
                     listener.onIgnoreRecovery(false, "canceled recovery");
                     return;
                 }
                 if (shard.state() == IndexShardState.CLOSED) {
                     removeAndCleanOnGoingRecovery(recoveryStatus);
                     listener.onIgnoreRecovery(false, "local shard closed, stop recovery");
                     return;
                 }
                 Throwable cause = ExceptionsHelper.unwrapCause(e);
                 if (cause instanceof RecoveryEngineException) {
                     // unwrap an exception that was thrown as part of the recovery
                     cause = cause.getCause();
                 }
                 // do it twice, in case we have double transport exception
                 cause = ExceptionsHelper.unwrapCause(cause);
                 if (cause instanceof RecoveryEngineException) {
                     // unwrap an exception that was thrown as part of the recovery
                     cause = cause.getCause();
                 }

                 // here, we would add checks against exception that need to be retried (and not removeAndClean in this case)

                 if (cause instanceof IndexShardNotStartedException || cause instanceof IndexMissingException || cause instanceof IndexShardMissingException) {
                     // if the target is not ready yet, retry
                     listener.onRetryRecovery(TimeValue.timeValueMillis(500), recoveryStatus);
                     return;
                 }
                 if (cause instanceof DelayRecoveryException) {
                     listener.onRetryRecovery(TimeValue.timeValueMillis(500), recoveryStatus);
                     return;
                 }
                 // here, we check against ignore recovery options
                 // in general, no need to clean the shard on ignored recovery, since we want to try and reuse it later
                 // it will get deleted in the IndicesStore if all are allocated and no shard exists on this node...
                 removeAndCleanOnGoingRecovery(recoveryStatus);

                 if (cause instanceof ConnectTransportException) {
                     listener.onIgnoreRecovery(true, "source node disconnected (" + request.sourceNode() + ")");
                     return;
                 }
                 if (cause instanceof IndexShardClosedException) {
                     listener.onIgnoreRecovery(true, "source shard is closed (" + request.sourceNode() + ")");
                     return;
                 }
                 if (cause instanceof AlreadyClosedException) {
                     listener.onIgnoreRecovery(true, "source shard is closed (" + request.sourceNode() + ")");
                     return;
                 }
                 logger.warn("[{}][{}] recovery from [{}] failed", e, request.shardId().index().name(), request.shardId().id(), request.sourceNode());
                 listener.onRecoveryFailure(new RecoveryFailedException(request, e), true);
+            } finally {
+                shard.store().decRef();
             }
+        } else {
+            listener.onIgnoreRecovery(false, "local store closed, stop recovery");
+        }
     }