Merge branch 'master' into fix/12329

2025-03-25 01:19:02 +00:00 · 2015-07-21 00:29:34 -07:00 · 2015-07-21 00:29:34 -07:00 · 8cd03cce5e
commit 8cd03cce5e
parent 1c99626b84 f4ba5ac6b8
68 changed files with 2684 additions and 1220 deletions
--- a/core/pom.xml
+++ b/core/pom.xml
@ -299,39 +299,6 @@
        </testResources>

        <plugins>
-          <plugin>
-            <groupId>org.codehaus.mojo</groupId>
-            <artifactId>exec-maven-plugin</artifactId>
-            <version>1.4.0</version>
-            <executions>
-              <execution>
-                <goals>
-                  <goal>exec</goal>
-                </goals>
-              </execution>
-            </executions>
-            <configuration>
-              <executable>${jvm.executable}</executable>
-              <arguments>
-                <argument>-Des.security.manager.enabled=false</argument>
-                <argument>-classpath</argument>
-                <classpath/>
-                <argument>org.elasticsearch.bootstrap.Bootstrap</argument>
-                <argument>-Xms256m</argument>
-                <argument>-Xmx1g</argument>
-                <argument>-Djava.awt.headless=true</argument>
-                <argument>-XX:+UseParNewGC</argument>
-                <argument>-XX:+UseConcMarkSweepGC</argument>
-                <argument>-XX:CMSInitiatingOccupancyFraction=75</argument>
-                <argument>-XX:+UseCMSInitiatingOccupancyOnly</argument>
-                <argument>-XX:+HeapDumpOnOutOfMemoryError</argument>
-                <argument>-XX:+DisableExplicitGC</argument>
-                <argument>-Dfile.encoding=UTF-8</argument>
-                <argument>-Djna.nosys=true</argument>
-                <argument>-Delasticsearch</argument>
-              </arguments>
-            </configuration>
-          </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-source-plugin</artifactId>
@ -1025,11 +992,24 @@
                    </execution>
                </executions>
            </plugin>
-            <!-- integration tests -->
+            <!-- execution and integration tests -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-antrun-plugin</artifactId>
                <executions>
+                    <!-- start up elasticsearch in foreground -->
+                    <execution>
+                        <id>execute</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>run</goal>
+                        </goals>
+                        <configuration>
+                            <target if="${run}">
+                                <ant antfile="${elasticsearch.integ.antfile}" target="start-foreground"/>
+                            </target>
+                        </configuration>
+                    </execution>
                    <!-- start up external cluster -->
                    <execution>
                        <id>integ-setup</id>
@ -1053,8 +1033,6 @@
                        <configuration>
                            <target>
                                <ant antfile="${elasticsearch.integ.antfile}" target="stop-external-cluster"/>
-                                <!-- TODO: remove this and the xslt when junit4 is fixed -->
-                                <ant antfile="${elasticsearch.integ.antfile}" target="fixup-failsafe-summary"/>
                            </target>
                        </configuration>
                    </execution>
--- a/core/src/main/java/org/elasticsearch/action/admin/indices/shards/TransportIndicesShardStoresAction.java
+++ b/core/src/main/java/org/elasticsearch/action/admin/indices/shards/TransportIndicesShardStoresAction.java
@ -83,6 +83,7 @@ public class TransportIndicesShardStoresAction extends TransportMasterNodeReadAc
        final String[] concreteIndices = indexNameExpressionResolver.concreteIndices(state, request);
        final Set<ShardId> shardIdsToFetch = new HashSet<>();

+        logger.trace("using cluster state version [{}] to determine shards", state.version());
        // collect relevant shard ids of the requested indices for fetching store infos
        for (String index : concreteIndices) {
            IndexRoutingTable indexShardRoutingTables = routingTables.index(index);
--- a/core/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java
+++ b/core/src/main/java/org/elasticsearch/cluster/action/shard/ShardStateAction.java
@ -145,17 +145,12 @@ public class ShardStateAction extends AbstractComponent {
                    return currentState;
                }

-                final MetaData metaData = currentState.getMetaData();
-
-
                List<FailedRerouteAllocation.FailedShard> shardRoutingsToBeApplied = new ArrayList<>(shardRoutingEntries.size());
-                for (ShardRoutingEntry entry : extractShardsToBeApplied(shardRoutingEntries, "failed", metaData, logger)) {
-                    shardRoutingsToBeApplied.add(new FailedRerouteAllocation.FailedShard(entry.shardRouting, entry.message, entry.failure));
-                }

                // mark all entries as processed
                for (ShardRoutingEntry entry : shardRoutingEntries) {
                    entry.processed = true;
+                    shardRoutingsToBeApplied.add(new FailedRerouteAllocation.FailedShard(entry.shardRouting, entry.message, entry.failure));
                }

                RoutingAllocation.Result routingResult = allocationService.applyFailedShards(currentState, shardRoutingsToBeApplied);
@ -180,31 +175,6 @@ public class ShardStateAction extends AbstractComponent {
        });
    }

-    static List<ShardRoutingEntry> extractShardsToBeApplied(List<ShardRoutingEntry> shardRoutingEntries, String type, MetaData metaData, ESLogger logger) {
-        List<ShardRoutingEntry> shardRoutingsToBeApplied = new ArrayList<>(shardRoutingEntries.size());
-        for (int i = 0; i < shardRoutingEntries.size(); i++) {
-            ShardRoutingEntry shardRoutingEntry = shardRoutingEntries.get(i);
-            ShardRouting shardRouting = shardRoutingEntry.shardRouting;
-            IndexMetaData indexMetaData = metaData.index(shardRouting.index());
-            // if there is no metadata or the current index is not of the right uuid, the index has been deleted while it was being allocated
-            // which is fine, we should just ignore this
-            if (indexMetaData == null) {
-                logger.debug("{} ignoring shard {}, unknown index in {}", shardRouting.shardId(), type, shardRoutingEntry);
-                continue;
-            }
-            if (!indexMetaData.isSameUUID(shardRoutingEntry.indexUUID)) {
-                logger.debug("{} ignoring shard {}, different index uuid, current {}, got {}", shardRouting.shardId(), type, indexMetaData.getIndexUUID(), shardRoutingEntry);
-                continue;
-            }
-
-            // more debug info will be logged by the allocation service
-            logger.trace("{} will apply shard {} {}", shardRouting.shardId(), type, shardRoutingEntry);
-            shardRoutingsToBeApplied.add(shardRoutingEntry);
-        }
-        return shardRoutingsToBeApplied;
-
-    }
-
    private void shardStartedOnMaster(final ShardRoutingEntry shardRoutingEntry) {
        logger.debug("received shard started for {}", shardRoutingEntry);
        // buffer shard started requests, and the state update tasks will simply drain it
@ -230,18 +200,12 @@ public class ShardStateAction extends AbstractComponent {
                            return currentState;
                        }

-                        RoutingTable routingTable = currentState.routingTable();
-                        MetaData metaData = currentState.getMetaData();
-
-
                        List<ShardRouting> shardRoutingToBeApplied = new ArrayList<>(shardRoutingEntries.size());
-                        for (ShardRoutingEntry entry : extractShardsToBeApplied(shardRoutingEntries, "started", metaData, logger)) {
-                            shardRoutingToBeApplied.add(entry.shardRouting);
-                        }

                        // mark all entries as processed
                        for (ShardRoutingEntry entry : shardRoutingEntries) {
                            entry.processed = true;
+                            shardRoutingToBeApplied.add(entry.shardRouting);
                        }

                        if (shardRoutingToBeApplied.isEmpty()) {
--- a/core/src/main/java/org/elasticsearch/cluster/routing/AllocationId.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/AllocationId.java
@ -22,6 +22,8 @@ package org.elasticsearch.cluster.routing;
 import org.elasticsearch.common.Strings;
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
+import org.elasticsearch.common.xcontent.ToXContent;
+import org.elasticsearch.common.xcontent.XContentBuilder;

 import java.io.IOException;

@ -34,7 +36,7 @@ import java.io.IOException;
 * relocationId. Once relocation is done, the new allocation id is set to the relocationId. This is similar
 * behavior to how ShardRouting#currentNodeId is used.
 */
-public class AllocationId {
+public class AllocationId implements ToXContent {

    private final String id;
    private final String relocationId;
@ -67,7 +69,7 @@ public class AllocationId {
     */
    public static AllocationId newTargetRelocation(AllocationId allocationId) {
        assert allocationId.getRelocationId() != null;
-        return new AllocationId(allocationId.getRelocationId(), null);
+        return new AllocationId(allocationId.getRelocationId(), allocationId.getId());
    }

    /**
@ -81,19 +83,24 @@ public class AllocationId {

    /**
     * Creates a new allocation id representing a cancelled relocation.
-     */
+     *
+     * Note that this is expected to be called on the allocation id
+     * of the *source* shard.
+     */
    public static AllocationId cancelRelocation(AllocationId allocationId) {
        assert allocationId.getRelocationId() != null;
        return new AllocationId(allocationId.getId(), null);
    }

    /**
-     * Creates a new allocation id finalizing a relocation, moving the transient
-     * relocation id to be the actual id.
+     * Creates a new allocation id finalizing a relocation.
+     *
+     * Note that this is expected to be called on the allocation id
+     * of the *target* shard and thus it only needs to clear the relocating id.
     */
    public static AllocationId finishRelocation(AllocationId allocationId) {
        assert allocationId.getRelocationId() != null;
-        return new AllocationId(allocationId.getRelocationId(), null);
+        return new AllocationId(allocationId.getId(), null);
    }

    /**
@ -126,4 +133,20 @@ public class AllocationId {
        result = 31 * result + (relocationId != null ? relocationId.hashCode() : 0);
        return result;
    }
+
+    @Override
+    public String toString() {
+        return "[id=" + id + (relocationId == null ? "" : ", rId=" + relocationId) + "]";
+    }
+
+    @Override
+    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
+        builder.startObject("allocation_id");
+        builder.field("id", id);
+        if (relocationId != null) {
+            builder.field("relocation_id", relocationId);
+        }
+        builder.endObject();
+        return builder;
+    }
 }
--- a/core/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/RoutingNodes.java
@ -52,9 +52,7 @@ public class RoutingNodes implements Iterable<RoutingNode> {

    private final Map<String, RoutingNode> nodesToShards = newHashMap();

-    private final UnassignedShards unassignedShards = new UnassignedShards();
-
-    private final List<ShardRouting> ignoredUnassignedShards = newArrayList();
+    private final UnassignedShards unassignedShards = new UnassignedShards(this);

    private final Map<ShardId, List<ShardRouting>> assignedShards = newHashMap();

@ -185,10 +183,6 @@ public class RoutingNodes implements Iterable<RoutingNode> {
        return !unassignedShards.isEmpty();
    }

-    public List<ShardRouting> ignoredUnassigned() {
-        return this.ignoredUnassignedShards;
-    }
-
    public UnassignedShards unassigned() {
        return this.unassignedShards;
    }
@ -394,15 +388,14 @@ public class RoutingNodes implements Iterable<RoutingNode> {
     * Mark a shard as started and adjusts internal statistics.
     */
    public void started(ShardRouting shard) {
-        if (!shard.active() && shard.relocatingNodeId() == null) {
+        assert !shard.active() : "expected an initializing shard " + shard;
+        if (shard.relocatingNodeId() == null) {
+            // if this is not a target shard for relocation, we need to update statistics
            inactiveShardCount--;
            if (shard.primary()) {
                inactivePrimaryCount--;
            }
-        } else if (shard.relocating()) {
-            relocatingShards--;
        }
-        assert !shard.started();
        shard.moveToStarted();
    }

@ -527,9 +520,11 @@ public class RoutingNodes implements Iterable<RoutingNode> {

    }

-    public final static class UnassignedShards implements Iterable<ShardRouting>  {
+    public static final class UnassignedShards implements Iterable<ShardRouting>  {

+        private final RoutingNodes nodes;
        private final List<ShardRouting> unassigned;
+        private final List<ShardRouting> ignored;

        private int primaries = 0;
        private long transactionId = 0;
@ -537,14 +532,18 @@ public class RoutingNodes implements Iterable<RoutingNode> {
        private final long sourceTransactionId;

        public UnassignedShards(UnassignedShards other) {
+            this.nodes = other.nodes;
            source = other;
            sourceTransactionId = other.transactionId;
            unassigned = new ArrayList<>(other.unassigned);
+            ignored = new ArrayList<>(other.ignored);
            primaries = other.primaries;
        }

-        public UnassignedShards() {
+        public UnassignedShards(RoutingNodes nodes) {
+            this.nodes = nodes;
            unassigned = new ArrayList<>();
+            ignored = new ArrayList<>();
            source = null;
            sourceTransactionId = -1;
        }
@ -557,12 +556,6 @@ public class RoutingNodes implements Iterable<RoutingNode> {
            transactionId++;
        }

-        public void addAll(Collection<ShardRouting> mutableShardRoutings) {
-            for (ShardRouting r : mutableShardRoutings) {
-                add(r);
-            }
-        }
-
        public void sort(Comparator<ShardRouting> comparator) {
            CollectionUtil.timSort(unassigned, comparator);
        }
@ -576,29 +569,87 @@ public class RoutingNodes implements Iterable<RoutingNode> {
        }

        @Override
-        public Iterator<ShardRouting> iterator() {
-            final Iterator<ShardRouting> iterator = unassigned.iterator();
-            return new Iterator<ShardRouting>() {
-                private ShardRouting current;
-                @Override
-                public boolean hasNext() {
-                    return iterator.hasNext();
-                }
+        public UnassignedIterator iterator() {
+            return new UnassignedIterator();
+        }

-                @Override
-                public ShardRouting next() {
-                    return current = iterator.next();
-                }
+        /**
+         * The list of ignored unassigned shards (read only). The ignored unassigned shards
+         * are not part of the formal unassigned list, but are kept around and used to build
+         * back the list of unassigned shards as part of the routing table.
+         */
+        public List<ShardRouting> ignored() {
+            return Collections.unmodifiableList(ignored);
+        }

-                @Override
-                public void remove() {
-                    iterator.remove();
-                    if (current.primary()) {
-                        primaries--;
-                    }
-                    transactionId++;
+        /**
+         * Adds a shard to the ignored unassigned list. Should be used with caution; typically,
+         * the correct usage is to call removeAndIgnore on the iterator.
+         */
+        public void ignoreShard(ShardRouting shard) {
+            ignored.add(shard);
+            transactionId++;
+        }
+
+        public class UnassignedIterator implements Iterator<ShardRouting> {
+
+            private final Iterator<ShardRouting> iterator;
+            private ShardRouting current;
+
+            public UnassignedIterator() {
+                this.iterator = unassigned.iterator();
+            }
+
+            @Override
+            public boolean hasNext() {
+                return iterator.hasNext();
+            }
+
+            @Override
+            public ShardRouting next() {
+                return current = iterator.next();
+            }
+
+            /**
+             * Initializes the current unassigned shard and moves it from the unassigned list.
+             */
+            public void initialize(String nodeId) {
+                initialize(nodeId, current.version());
+            }
+
+            /**
+             * Initializes the current unassigned shard and moves it from the unassigned list.
+             */
+            public void initialize(String nodeId, long version) {
+                innerRemove();
+                nodes.initialize(new ShardRouting(current, version), nodeId);
+            }
+
+            /**
+             * Removes and ignores the unassigned shard (will be ignored for this run, but
+             * will be added back to unassigned once the metadata is constructed again).
+             */
+            public void removeAndIgnore() {
+                innerRemove();
+                ignoreShard(current);
+            }
+
+            /**
+             * Unsupported operation, just there for the interface. Use {@link #removeAndIgnore()} or
+             * {@link #initialize(String)}.
+             */
+            @Override
+            public void remove() {
+                throw new UnsupportedOperationException("remove is not supported in unassigned iterator, use removeAndIgnore or initialize");
+            }
+
+            private void innerRemove() {
+                iterator.remove();
+                if (current.primary()) {
+                    primaries--;
                }
-            };
+                transactionId++;
+            }
        }

        public boolean isEmpty() {
@ -612,16 +663,19 @@ public class RoutingNodes implements Iterable<RoutingNode> {
        public void clear() {
            transactionId++;
            unassigned.clear();
+            ignored.clear();
            primaries = 0;
        }

        public void transactionEnd(UnassignedShards shards) {
-           assert shards.source == this && shards.sourceTransactionId == transactionId :
-                   "Expected ID: " + shards.sourceTransactionId + " actual: " + transactionId + " Expected Source: " + shards.source + " actual: " + this;
-           transactionId++;
-           this.unassigned.clear();
-           this.unassigned.addAll(shards.unassigned);
-           this.primaries = shards.primaries;
+            assert shards.source == this && shards.sourceTransactionId == transactionId :
+                    "Expected ID: " + shards.sourceTransactionId + " actual: " + transactionId + " Expected Source: " + shards.source + " actual: " + this;
+            transactionId++;
+            this.unassigned.clear();
+            this.unassigned.addAll(shards.unassigned);
+            this.ignored.clear();
+            this.ignored.addAll(shards.ignored);
+            this.primaries = shards.primaries;
        }

        public UnassignedShards transactionBegin() {
@ -757,6 +811,7 @@ public class RoutingNodes implements Iterable<RoutingNode> {
        private final RoutingNode iterable;
        private ShardRouting shard;
        private final Iterator<ShardRouting> delegate;
+        private boolean removed = false;

        public RoutingNodeIterator(RoutingNode iterable) {
            this.delegate = iterable.mutableIterator();
@ -770,6 +825,7 @@ public class RoutingNodes implements Iterable<RoutingNode> {

        @Override
        public ShardRouting next() {
+            removed = false;
            return shard = delegate.next();
        }

@ -777,6 +833,13 @@ public class RoutingNodes implements Iterable<RoutingNode> {
        public void remove() {
            delegate.remove();
            RoutingNodes.this.remove(shard);
+            removed = true;
+        }
+
+
+        /** returns true if {@link #remove()} or {@link #moveToUnassigned(UnassignedInfo)} were called on the current shard */
+        public boolean isRemoved() {
+            return removed;
        }

        @Override
@ -785,10 +848,16 @@ public class RoutingNodes implements Iterable<RoutingNode> {
        }

        public void moveToUnassigned(UnassignedInfo unassignedInfo) {
-            remove();
+            if (isRemoved() == false) {
+                remove();
+            }
            ShardRouting unassigned = new ShardRouting(shard); // protective copy of the mutable shard
            unassigned.moveToUnassigned(unassignedInfo);
            unassigned().add(unassigned);
        }
+
+        public ShardRouting current() {
+            return shard;
+        }
    }
 }
--- a/core/src/main/java/org/elasticsearch/cluster/routing/RoutingTable.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/RoutingTable.java
@ -362,7 +362,7 @@ public class RoutingTable implements Iterable<IndexRoutingTable>, Diffable<Routi
                    indexBuilder.addShard(refData, shardRoutingEntry);
                }
            }
-            for (ShardRouting shardRoutingEntry : Iterables.concat(routingNodes.unassigned(), routingNodes.ignoredUnassigned())) {
+            for (ShardRouting shardRoutingEntry : Iterables.concat(routingNodes.unassigned(), routingNodes.unassigned().ignored())) {
                String index = shardRoutingEntry.index();
                IndexRoutingTable.Builder indexBuilder = indexRoutingTableBuilders.get(index);
                if (indexBuilder == null) {
--- a/core/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/ShardRouting.java
@ -441,11 +441,12 @@ public final class ShardRouting implements Streamable, ToXContent {
    void moveToStarted() {
        ensureNotFrozen();
        version++;
-        assert state == ShardRoutingState.INITIALIZING || state == ShardRoutingState.RELOCATING : this;
+        assert state == ShardRoutingState.INITIALIZING : "expected an initializing shard " + this;
        relocatingNodeId = null;
        restoreSource = null;
        unassignedInfo = null; // we keep the unassigned data until the shard is started
-        if (state == ShardRoutingState.RELOCATING) {
+        if (allocationId.getRelocationId() != null) {
+            // target relocation
            allocationId = AllocationId.finishRelocation(allocationId);
        }
        state = ShardRoutingState.STARTED;
@ -502,6 +503,9 @@ public final class ShardRouting implements Streamable, ToXContent {
        if (relocatingNodeId != null ? !relocatingNodeId.equals(that.relocatingNodeId) : that.relocatingNodeId != null) {
            return false;
        }
+        if (allocationId != null ? !allocationId.equals(that.allocationId) : that.allocationId != null) {
+            return false;
+        }
        if (state != that.state) {
            return false;
        }
@ -526,6 +530,7 @@ public final class ShardRouting implements Streamable, ToXContent {
        result = 31 * result + (primary ? 1 : 0);
        result = 31 * result + (state != null ? state.hashCode() : 0);
        result = 31 * result + (restoreSource != null ? restoreSource.hashCode() : 0);
+        result = 31 * result + (allocationId != null ? allocationId.hashCode() : 0);
        return hashCode = result;
    }

@ -549,10 +554,14 @@ public final class ShardRouting implements Streamable, ToXContent {
        } else {
            sb.append("[R]");
        }
+        sb.append(", v[").append(version).append("]");
        if (this.restoreSource != null) {
            sb.append(", restoring[" + restoreSource + "]");
        }
        sb.append(", s[").append(state).append("]");
+        if (allocationId != null) {
+            sb.append(", a").append(allocationId);
+        }
        if (this.unassignedInfo != null) {
            sb.append(", ").append(unassignedInfo.toString());
        }
@ -567,11 +576,16 @@ public final class ShardRouting implements Streamable, ToXContent {
                .field("node", currentNodeId())
                .field("relocating_node", relocatingNodeId())
                .field("shard", shardId().id())
-                .field("index", shardId().index().name());
+                .field("index", shardId().index().name())
+                .field("version", version);
+
        if (restoreSource() != null) {
            builder.field("restore_source");
            restoreSource().toXContent(builder, params);
        }
+        if (allocationId != null) {
+            allocationId.toXContent(builder, params);
+        }
        if (unassignedInfo != null) {
            unassignedInfo.toXContent(builder, params);
        }
--- a/core/src/main/java/org/elasticsearch/cluster/routing/UnassignedInfo.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/UnassignedInfo.java
@ -257,19 +257,22 @@ public class UnassignedInfo implements ToXContent, Writeable<UnassignedInfo> {
        return nextDelay == Long.MAX_VALUE ? 0l : nextDelay;
    }

-    @Override
-    public String toString() {
+    public String shortSummary() {
        StringBuilder sb = new StringBuilder();
-        sb.append("unassigned_info[[reason=").append(reason).append("]");
+        sb.append("[reason=").append(reason).append("]");
        sb.append(", at[").append(DATE_TIME_FORMATTER.printer().print(timestamp)).append("]");
        String details = getDetails();
        if (details != null) {
            sb.append(", details[").append(details).append("]");
        }
-        sb.append("]");
        return sb.toString();
    }

+    @Override
+    public String toString() {
+        return "unassigned_info[" + shortSummary() + "]";
+    }
+
    @Override
    public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
        builder.startObject("unassigned_info");
--- a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/AllocationService.java
@ -22,7 +22,6 @@ package org.elasticsearch.cluster.routing.allocation;
 import com.carrotsearch.hppc.cursors.ObjectCursor;
 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.Lists;
-import org.elasticsearch.ExceptionsHelper;
 import org.elasticsearch.cluster.ClusterInfoService;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
@ -325,47 +324,50 @@ public class AllocationService extends AbstractComponent {
        for (ShardRouting startedShard : startedShardEntries) {
            assert startedShard.initializing();

-            // retrieve the relocating node id before calling startedShard().
-            String relocatingNodeId = null;
+            // validate index still exists. strictly speaking this is not needed but it gives clearer logs
+            if (routingNodes.routingTable().index(startedShard.index()) == null) {
+                logger.debug("{} ignoring shard started, unknown index (routing: {})", startedShard.shardId(), startedShard);
+                continue;
+            }
+

            RoutingNodes.RoutingNodeIterator currentRoutingNode = routingNodes.routingNodeIter(startedShard.currentNodeId());
-            if (currentRoutingNode != null) {
-                for (ShardRouting shard : currentRoutingNode) {
-                    if (shard.shardId().equals(startedShard.shardId())) {
-                        if (shard.equals(startedShard)) {
-                            relocatingNodeId = shard.relocatingNodeId();
-                            dirty = true;
-                            routingNodes.started(shard);
-                            logger.trace("{} marked as started", shard);
-                        } else {
-                            logger.debug("failed to find shard [{}] in order to start it [no matching shard on node], ignoring", startedShard);
-                        }
-                        break;
-                    }
-                }
-            } else {
-                logger.debug("failed to find shard [{}] in order to start it [failed to find node], ignoring", startedShard);
+            if (currentRoutingNode == null) {
+                logger.debug("{} failed to find shard in order to start it [failed to find node], ignoring (routing: {})", startedShard.shardId(), startedShard);
+                continue;
+            }

+            for (ShardRouting shard : currentRoutingNode) {
+                if (shard.allocationId().getId().equals(startedShard.allocationId().getId())) {
+                    if (shard.active()) {
+                        logger.trace("{} shard is already started, ignoring (routing: {})", startedShard.shardId(), startedShard);
+                    } else {
+                        dirty = true;
+                        // override started shard with the latest copy. Capture it now, before starting the shard destroys it...
+                        startedShard = new ShardRouting(shard);
+                        routingNodes.started(shard);
+                        logger.trace("{} marked shard as started (routing: {})", startedShard.shardId(), startedShard);
+                    }
+                    break;
+                }
            }

            // startedShard is the current state of the shard (post relocation for example)
            // this means that after relocation, the state will be started and the currentNodeId will be
            // the node we relocated to
-
-            if (relocatingNodeId == null) {
+            if (startedShard.relocatingNodeId() == null) {
                continue;
            }

-            RoutingNodes.RoutingNodeIterator sourceRoutingNode = routingNodes.routingNodeIter(relocatingNodeId);
+            RoutingNodes.RoutingNodeIterator sourceRoutingNode = routingNodes.routingNodeIter(startedShard.relocatingNodeId());
            if (sourceRoutingNode != null) {
                while (sourceRoutingNode.hasNext()) {
                    ShardRouting shard = sourceRoutingNode.next();
-                    if (shard.shardId().equals(startedShard.shardId())) {
-                        if (shard.relocating()) {
-                            dirty = true;
-                            sourceRoutingNode.remove();
-                            break;
-                        }
+                    if (shard.allocationId().getId().equals(startedShard.allocationId().getRelocationId())) {
+                        assert shard.relocating() : "source shard for relocation is not marked as relocating. source " + shard + ", started target " + startedShard;
+                        dirty = true;
+                        sourceRoutingNode.remove();
+                        break;
                    }
                }
            }
@ -378,133 +380,89 @@ public class AllocationService extends AbstractComponent {
     * require relocation.
     */
    private boolean applyFailedShard(RoutingAllocation allocation, ShardRouting failedShard, boolean addToIgnoreList, UnassignedInfo unassignedInfo) {
-        // create a copy of the failed shard, since we assume we can change possible references to it without
-        // changing the state of failed shard
-        failedShard = new ShardRouting(failedShard);
-
        IndexRoutingTable indexRoutingTable = allocation.routingTable().index(failedShard.index());
        if (indexRoutingTable == null) {
+            logger.debug("{} ignoring shard failure, unknown index in {} ({})", failedShard.shardId(), failedShard, unassignedInfo.shortSummary());
            return false;
        }

        RoutingNodes routingNodes = allocation.routingNodes();
-        boolean dirty = false;
-        if (failedShard.relocatingNodeId() != null) {
-            // the shard is relocating, either in initializing (recovery from another node) or relocating (moving to another node)
-            if (failedShard.initializing()) {
-                // the shard is initializing and recovering from another node
-                // first, we need to cancel the current node that is being initialized
-                RoutingNodes.RoutingNodeIterator initializingNode = routingNodes.routingNodeIter(failedShard.currentNodeId());
-                if (initializingNode != null) {
-                    while (initializingNode.hasNext()) {
-                        ShardRouting shardRouting = initializingNode.next();
-                        if (shardRouting.equals(failedShard)) {
-                            dirty = true;
-                            initializingNode.remove();
-                            if (addToIgnoreList) {
-                                // make sure we ignore this shard on the relevant node
-                                allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
-                            }

-                            break;
-                        }
-                    }
-                }
-                if (dirty) {
-                    logger.debug("failed shard {} found in routingNodes, failing it", failedShard);
-                    // now, find the node that we are relocating *from*, and cancel its relocation
-                    RoutingNode relocatingFromNode = routingNodes.node(failedShard.relocatingNodeId());
-                    if (relocatingFromNode != null) {
-                        for (ShardRouting shardRouting : relocatingFromNode) {
-                            if (shardRouting.shardId().equals(failedShard.shardId()) && shardRouting.relocating()) {
-                                dirty = true;
-                                routingNodes.cancelRelocation(shardRouting);
-                                break;
-                            }
-                        }
-                    }
-                } else {
-                    logger.debug("failed shard {} not found in routingNodes, ignoring it", failedShard);
-                }
-                return dirty;
-            } else if (failedShard.relocating()) {
-                // the shard is relocating, meaning its the source the shard is relocating from
-                // first, we need to cancel the current relocation from the current node
-                // now, find the node that we are recovering from, cancel the relocation, remove it from the node
-                // and add it to the unassigned shards list...
-                RoutingNodes.RoutingNodeIterator relocatingFromNode = routingNodes.routingNodeIter(failedShard.currentNodeId());
-                if (relocatingFromNode != null) {
-                    while (relocatingFromNode.hasNext()) {
-                        ShardRouting shardRouting = relocatingFromNode.next();
-                        if (shardRouting.equals(failedShard)) {
-                            dirty = true;
-                            if (addToIgnoreList) {
-                                // make sure we ignore this shard on the relevant node
-                                allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
-                            }
-                            relocatingFromNode.moveToUnassigned(unassignedInfo);
-                            break;
-                        }
-                    }
-                }
-                if (dirty) {
-                    logger.debug("failed shard {} found in routingNodes, failing it", failedShard);
-                    // next, we need to find the target initializing shard that is recovering from, and remove it...
-                    RoutingNodes.RoutingNodeIterator initializingNode = routingNodes.routingNodeIter(failedShard.relocatingNodeId());
-                    if (initializingNode != null) {
-                        while (initializingNode.hasNext()) {
-                            ShardRouting shardRouting = initializingNode.next();
-                            if (shardRouting.shardId().equals(failedShard.shardId()) && shardRouting.initializing()) {
-                                dirty = true;
-                                initializingNode.remove();
-                            }
-                        }
-                    }
-                } else {
-                    logger.debug("failed shard {} not found in routingNodes, ignoring it", failedShard);
-                }
-            } else {
-                throw new IllegalStateException("illegal state for a failed shard, relocating node id is set, but state does not match: " + failedShard);
+        RoutingNodes.RoutingNodeIterator matchedNode = routingNodes.routingNodeIter(failedShard.currentNodeId());
+        if (matchedNode == null) {
+            logger.debug("{} ignoring shard failure, unknown node in {} ({})", failedShard.shardId(), failedShard, unassignedInfo.shortSummary());
+            return false;
+        }
+
+        boolean matchedShard = false;
+        while (matchedNode.hasNext()) {
+            ShardRouting routing = matchedNode.next();
+            if (routing.allocationId().getId().equals(failedShard.allocationId().getId())) {
+                matchedShard = true;
+                logger.debug("{} failed shard {} found in routingNodes, failing it ({})", failedShard.shardId(), failedShard, unassignedInfo.shortSummary());
+                break;
            }
-        } else {
-            // the shard is not relocating, its either started, or initializing, just cancel it and move on...
-            RoutingNodes.RoutingNodeIterator node = routingNodes.routingNodeIter(failedShard.currentNodeId());
-            if (node != null) {
-                while (node.hasNext()) {
-                    ShardRouting shardRouting = node.next();
-                    if (shardRouting.equals(failedShard)) {
-                        dirty = true;
-                        if (addToIgnoreList) {
-                            // make sure we ignore this shard on the relevant node
-                            allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
-                        }
-                        // move all the shards matching the failed shard to the end of the unassigned list
-                        // so we give a chance for other allocations and won't create poison failed allocations
-                        // that can keep other shards from being allocated (because of limits applied on how many
-                        // shards we can start per node)
-                        List<ShardRouting> shardsToMove = Lists.newArrayList();
-                        for (Iterator<ShardRouting> unassignedIt = routingNodes.unassigned().iterator(); unassignedIt.hasNext(); ) {
-                            ShardRouting unassignedShardRouting = unassignedIt.next();
-                            if (unassignedShardRouting.shardId().equals(failedShard.shardId())) {
-                                unassignedIt.remove();
-                                shardsToMove.add(unassignedShardRouting);
-                            }
-                        }
-                        if (!shardsToMove.isEmpty()) {
-                            routingNodes.unassigned().addAll(shardsToMove);
-                        }
+        }

-                        node.moveToUnassigned(unassignedInfo);
+        if (matchedShard == false) {
+            logger.debug("{} ignoring shard failure, unknown allocation id in {} ({})", failedShard.shardId(), failedShard, unassignedInfo.shortSummary());
+            return false;
+        }
+
+        // replace incoming instance to make sure we work on the latest one. Copy it to maintain information during modifications.
+        failedShard = new ShardRouting(matchedNode.current());
+
+        // remove the current copy of the shard
+        matchedNode.remove();
+
+        if (addToIgnoreList) {
+            // make sure we ignore this shard on the relevant node
+            allocation.addIgnoreShardForNode(failedShard.shardId(), failedShard.currentNodeId());
+        }
+
+        if (failedShard.relocatingNodeId() != null && failedShard.initializing()) {
+            // The shard is a target of a relocating shard. In that case we only
+            // need to remove the target shard and cancel the source relocation.
+            // No shard is left unassigned
+            logger.trace("{} is a relocation target, resolving source to cancel relocation ({})", failedShard, unassignedInfo.shortSummary());
+            RoutingNode relocatingFromNode = routingNodes.node(failedShard.relocatingNodeId());
+            if (relocatingFromNode != null) {
+                for (ShardRouting shardRouting : relocatingFromNode) {
+                    if (shardRouting.allocationId().getId().equals(failedShard.allocationId().getRelocationId())) {
+                        logger.trace("{}, resolved source to [{}]. canceling relocation ... ({})", failedShard.shardId(), shardRouting, unassignedInfo.shortSummary());
+                        routingNodes.cancelRelocation(shardRouting);
                        break;
                    }
                }
            }
-            if (dirty) {
-                logger.debug("failed shard {} found in routingNodes and failed", failedShard);
-            } else {
-                logger.debug("failed shard {} not found in routingNodes, ignoring it", failedShard);
+        } else {
+            // The failed shard is the main copy of the current shard routing. Any
+            // relocation will be cancelled (and the target shard removed as well)
+            // and the shard copy needs to be marked as unassigned
+
+            if (failedShard.relocatingNodeId() != null) {
+                // handle relocation source shards. We need to find the target initializing shard that is recovering from it, and remove it...
+                assert failedShard.initializing() == false; // should have been dealt with and returned
+                assert failedShard.relocating();
+
+                RoutingNodes.RoutingNodeIterator initializingNode = routingNodes.routingNodeIter(failedShard.relocatingNodeId());
+                if (initializingNode != null) {
+                    while (initializingNode.hasNext()) {
+                        ShardRouting shardRouting = initializingNode.next();
+                        if (shardRouting.allocationId().getId().equals(failedShard.allocationId().getRelocationId())) {
+                            assert shardRouting.initializing() : shardRouting;
+                            assert failedShard.allocationId().getId().equals(shardRouting.allocationId().getRelocationId())
+                                    : "found target shard's allocation relocation id is different than source";
+                            logger.trace("{} is removed due to the failure of the source shard", shardRouting);
+                            initializingNode.remove();
+                        }
+                    }
+                }
            }
+
+            matchedNode.moveToUnassigned(unassignedInfo);
        }
-        return dirty;
+        assert matchedNode.isRemoved() : "failedShard " + failedShard + " was matched but wasn't removed";
+        return true;
    }
 }
--- a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java
@ -41,7 +41,6 @@ import org.elasticsearch.node.settings.NodeSettingsService;

 import java.util.*;

-import static org.elasticsearch.cluster.routing.ShardRoutingState.INITIALIZING;
 import static org.elasticsearch.cluster.routing.ShardRoutingState.RELOCATING;

 /**
@ -292,7 +291,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
            }
            indices.addAll(allocation.routingTable().indicesRouting().keySet());
            buildModelFromAssigned(routing.shards(assignedFilter));
-            return allocateUnassigned(unassigned, routing.ignoredUnassigned());
+            return allocateUnassigned(unassigned);
        }

        private static float absDelta(float lower, float higher) {
@ -551,7 +550,7 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
         * Allocates all given shards on the minimal eligable node for the shards index
         * with respect to the weight function. All given shards must be unassigned.
         */
-        private boolean allocateUnassigned(RoutingNodes.UnassignedShards unassigned, List<ShardRouting> ignoredUnassigned) {
+        private boolean allocateUnassigned(RoutingNodes.UnassignedShards unassigned) {
            assert !nodes.isEmpty();
            if (logger.isTraceEnabled()) {
                logger.trace("Start allocating unassigned shards");
@ -600,9 +599,9 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
                    if (!shard.primary()) {
                        boolean drop = deciders.canAllocate(shard, allocation).type() == Type.NO;
                        if (drop) {
-                            ignoredUnassigned.add(shard);
+                            unassigned.ignoreShard(shard);
                            while(i < primaryLength-1 && comparator.compare(primary[i], primary[i+1]) == 0) {
-                                ignoredUnassigned.add(primary[++i]);
+                                unassigned.ignoreShard(primary[++i]);
                            }
                            continue;
                        } else {
@ -706,10 +705,10 @@ public class BalancedShardsAllocator extends AbstractComponent implements Shards
                    } else if (logger.isTraceEnabled()) {
                        logger.trace("No Node found to assign shard [{}]", shard);
                    }
-                    ignoredUnassigned.add(shard);
+                    unassigned.ignoreShard(shard);
                    if (!shard.primary()) { // we could not allocate it and we are a replica - check if we can ignore the other replicas
                        while(secondaryLength > 0 && comparator.compare(shard, secondary[secondaryLength-1]) == 0) {
-                            ignoredUnassigned.add(secondary[--secondaryLength]);
+                            unassigned.ignoreShard(secondary[--secondaryLength]);
                        }
                    }
                }
--- a/core/src/main/java/org/elasticsearch/cluster/routing/allocation/command/AllocateAllocationCommand.java
+++ b/core/src/main/java/org/elasticsearch/cluster/routing/allocation/command/AllocateAllocationCommand.java
@ -220,12 +220,11 @@ public class AllocateAllocationCommand implements AllocationCommand {
            throw new IllegalArgumentException("[allocate] allocation of " + shardId + " on node " + discoNode + " is not allowed, reason: " + decision);
        }
        // go over and remove it from the unassigned
-        for (Iterator<ShardRouting> it = routingNodes.unassigned().iterator(); it.hasNext(); ) {
+        for (RoutingNodes.UnassignedShards.UnassignedIterator it = routingNodes.unassigned().iterator(); it.hasNext(); ) {
            if (it.next() != shardRouting) {
                continue;
            }
-            it.remove();
-            routingNodes.initialize(shardRouting, routingNode.nodeId());
+            it.initialize(routingNode.nodeId());
            if (shardRouting.primary()) {
                // we need to clear the post allocation flag, since its an explicit allocation of the primary shard
                // and we want to force allocate it (and create a new index for it)
--- a/core/src/main/java/org/elasticsearch/common/util/concurrent/PrioritizedEsThreadPoolExecutor.java
+++ b/core/src/main/java/org/elasticsearch/common/util/concurrent/PrioritizedEsThreadPoolExecutor.java
@ -105,6 +105,7 @@ public class PrioritizedEsThreadPoolExecutor extends EsThreadPoolExecutor {
        } else if (!(command instanceof PrioritizedFutureTask)) { // it might be a callable wrapper...
            command = new TieBreakingPrioritizedRunnable(command, Priority.NORMAL, insertionOrder.incrementAndGet());
        }
+        super.execute(command);
        if (timeout.nanos() >= 0) {
            if (command instanceof TieBreakingPrioritizedRunnable) {
                ((TieBreakingPrioritizedRunnable) command).scheduleTimeout(timer, timeoutCallback, timeout);
@ -114,7 +115,6 @@ public class PrioritizedEsThreadPoolExecutor extends EsThreadPoolExecutor {
                throw new UnsupportedOperationException("Execute with timeout is not supported for future tasks");
            }
        }
-        super.execute(command);
    }

    @Override
@ -161,7 +161,8 @@ public class PrioritizedEsThreadPoolExecutor extends EsThreadPoolExecutor {

        private Runnable runnable;
        private final long insertionOrder;
-        private ScheduledFuture<?> timeoutFuture;
+        private volatile ScheduledFuture<?> timeoutFuture;
+        private volatile boolean started = false;

        TieBreakingPrioritizedRunnable(PrioritizedRunnable runnable, long insertionOrder) {
            this(runnable, runnable.priority(), insertionOrder);
@ -175,6 +176,9 @@ public class PrioritizedEsThreadPoolExecutor extends EsThreadPoolExecutor {

        @Override
        public void run() {
+            // mark the task as started. This is needed for synchronization with the timeout handling
+            // see #scheduleTimeout()
+            started = true;
            FutureUtils.cancel(timeoutFuture);
            runAndClean(runnable);
        }
@ -197,6 +201,10 @@ public class PrioritizedEsThreadPoolExecutor extends EsThreadPoolExecutor {
                    }
                }
            }, timeValue.nanos(), TimeUnit.NANOSECONDS);
+            if (started) {
+                // if the actual action already started, it might have missed the setting of the future. Clean it up ourselves.
+                FutureUtils.cancel(timeoutFuture);
+            }
        }

        /**
--- a/core/src/main/java/org/elasticsearch/gateway/GatewayAllocator.java
+++ b/core/src/main/java/org/elasticsearch/gateway/GatewayAllocator.java
@ -19,41 +19,28 @@

 package org.elasticsearch.gateway;

-import com.carrotsearch.hppc.ObjectLongHashMap;
-import com.carrotsearch.hppc.cursors.ObjectCursor;
-import com.google.common.collect.Lists;
-import com.google.common.collect.Maps;
-import com.google.common.collect.Sets;
-import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.action.support.nodes.BaseNodeResponse;
 import org.elasticsearch.action.support.nodes.BaseNodesResponse;
-import org.elasticsearch.cluster.*;
+import org.elasticsearch.cluster.ClusterChangedEvent;
+import org.elasticsearch.cluster.ClusterService;
+import org.elasticsearch.cluster.ClusterStateListener;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
-import org.elasticsearch.cluster.metadata.MetaData;
 import org.elasticsearch.cluster.node.DiscoveryNode;
-import org.elasticsearch.cluster.node.DiscoveryNodes;
-import org.elasticsearch.cluster.routing.*;
-import org.elasticsearch.cluster.routing.RoutingNode;
 import org.elasticsearch.cluster.routing.RoutingNodes;
+import org.elasticsearch.cluster.routing.RoutingService;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.routing.allocation.FailedRerouteAllocation;
 import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
 import org.elasticsearch.cluster.routing.allocation.StartedRerouteAllocation;
-import org.elasticsearch.cluster.routing.allocation.decider.Decision;
 import org.elasticsearch.common.component.AbstractComponent;
 import org.elasticsearch.common.inject.Inject;
 import org.elasticsearch.common.lease.Releasables;
 import org.elasticsearch.common.logging.ESLogger;
 import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.common.unit.ByteSizeValue;
-import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
-import org.elasticsearch.index.settings.IndexSettings;
 import org.elasticsearch.index.shard.ShardId;
-import org.elasticsearch.index.store.StoreFileMetaData;
 import org.elasticsearch.indices.store.TransportNodesListShardStoreMetaData;

-import java.util.*;
 import java.util.concurrent.ConcurrentMap;

 /**
@ -61,26 +48,19 @@ import java.util.concurrent.ConcurrentMap;
 */
 public class GatewayAllocator extends AbstractComponent {

-    public static final String INDEX_RECOVERY_INITIAL_SHARDS = "index.recovery.initial_shards";
-
-    private final String initialShards;
-
-    private final TransportNodesListGatewayStartedShards startedAction;
-    private final TransportNodesListShardStoreMetaData storeAction;
    private RoutingService routingService;

+    private final PrimaryShardAllocator primaryShardAllocator;
+    private final ReplicaShardAllocator replicaShardAllocator;
+
    private final ConcurrentMap<ShardId, AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards>> asyncFetchStarted = ConcurrentCollections.newConcurrentMap();
    private final ConcurrentMap<ShardId, AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData>> asyncFetchStore = ConcurrentCollections.newConcurrentMap();

    @Inject
-    public GatewayAllocator(Settings settings, TransportNodesListGatewayStartedShards startedAction, TransportNodesListShardStoreMetaData storeAction) {
+    public GatewayAllocator(Settings settings, final TransportNodesListGatewayStartedShards startedAction, final TransportNodesListShardStoreMetaData storeAction) {
        super(settings);
-        this.startedAction = startedAction;
-        this.storeAction = storeAction;
-
-        this.initialShards = settings.get("gateway.initial_shards", settings.get("gateway.local.initial_shards", "quorum"));
-
-        logger.debug("using initial_shards [{}]", initialShards);
+        this.primaryShardAllocator = new InternalPrimaryShardAllocator(settings, startedAction);
+        this.replicaShardAllocator = new InternalReplicaShardAllocator(settings, storeAction);
    }

    public void setReallocation(final ClusterService clusterService, final RoutingService routingService) {
@ -132,416 +112,21 @@ public class GatewayAllocator extends AbstractComponent {
        }
    }

-    /**
-     * Return {@code true} if the index is configured to allow shards to be
-     * recovered on any node
-     */
-    private boolean recoverOnAnyNode(@IndexSettings Settings idxSettings) {
-        return IndexMetaData.isOnSharedFilesystem(idxSettings) &&
-                idxSettings.getAsBoolean(IndexMetaData.SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE, false);
-    }
-
-    public boolean allocateUnassigned(RoutingAllocation allocation) {
+    public boolean allocateUnassigned(final RoutingAllocation allocation) {
        boolean changed = false;
-        DiscoveryNodes nodes = allocation.nodes();
-        RoutingNodes routingNodes = allocation.routingNodes();

-        // First, handle primaries, they must find a place to be allocated on here
-        final MetaData metaData = routingNodes.metaData();
-        RoutingNodes.UnassignedShards unassigned = routingNodes.unassigned();
+        RoutingNodes.UnassignedShards unassigned = allocation.routingNodes().unassigned();
        unassigned.sort(new PriorityComparator() {

            @Override
            protected Settings getIndexSettings(String index) {
-                IndexMetaData indexMetaData = metaData.index(index);
+                IndexMetaData indexMetaData = allocation.metaData().index(index);
                return indexMetaData.getSettings();
            }
        }); // sort for priority ordering
-        Iterator<ShardRouting> unassignedIterator = unassigned.iterator();
-        while (unassignedIterator.hasNext()) {
-            ShardRouting shard = unassignedIterator.next();

-            if (!shard.primary()) {
-                continue;
-            }
-
-            // this is an API allocation, ignore since we know there is no data...
-            if (!routingNodes.routingTable().index(shard.index()).shard(shard.id()).primaryAllocatedPostApi()) {
-                continue;
-            }
-
-            AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetch = asyncFetchStarted.get(shard.shardId());
-            if (fetch == null) {
-                fetch = new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
-                asyncFetchStarted.put(shard.shardId(), fetch);
-            }
-            AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> shardState = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
-            if (shardState.hasData() == false) {
-                logger.trace("{}: ignoring allocation, still fetching shard started state", shard);
-                unassignedIterator.remove();
-                routingNodes.ignoredUnassigned().add(shard);
-                continue;
-            }
-            shardState.processAllocation(allocation);
-
-            IndexMetaData indexMetaData = metaData.index(shard.getIndex());
-
-            /**
-             * Build a map of DiscoveryNodes to shard state number for the given shard.
-             * A state of -1 means the shard does not exist on the node, where any
-             * shard state >= 0 is the state version of the shard on that node's disk.
-             *
-             * A shard on shared storage will return at least shard state 0 for all
-             * nodes, indicating that the shard can be allocated to any node.
-             */
-            ObjectLongHashMap<DiscoveryNode> nodesState = new ObjectLongHashMap<>();
-            for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : shardState.getData().values()) {
-                long version = nodeShardState.version();
-                // -1 version means it does not exists, which is what the API returns, and what we expect to
-                if (nodeShardState.storeException() == null) {
-                    logger.trace("[{}] on node [{}] has version [{}] of shard", shard, nodeShardState.getNode(), version);
-                    nodesState.put(nodeShardState.getNode(), version);
-                } else {
-                    // when there is an store exception, we disregard the reported version and assign it as -1 (same as shard does not exist)
-                    logger.trace("[{}] on node [{}] has version [{}] but the store can not be opened, treating as version -1", nodeShardState.storeException(), shard, nodeShardState.getNode(), version);
-                    nodesState.put(nodeShardState.getNode(), -1);
-                }
-            }
-
-            int numberOfAllocationsFound = 0;
-            long highestVersion = -1;
-            final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap();
-
-            assert !nodesState.containsKey(null);
-            final Object[] keys = nodesState.keys;
-            final long[] values = nodesState.values;
-            Settings idxSettings = indexMetaData.settings();
-            for (int i = 0; i < keys.length; i++) {
-                if (keys[i] == null) {
-                    continue;
-                }
-
-                DiscoveryNode node = (DiscoveryNode) keys[i];
-                long version = values[i];
-                // since we don't check in NO allocation, we need to double check here
-                if (allocation.shouldIgnoreShardForNode(shard.shardId(), node.id())) {
-                    continue;
-                }
-                if (recoverOnAnyNode(idxSettings)) {
-                    numberOfAllocationsFound++;
-                    if (version > highestVersion) {
-                        highestVersion = version;
-                    }
-                    // We always put the node without clearing the map
-                    nodesWithVersion.put(node, version);
-                } else if (version != -1) {
-                    numberOfAllocationsFound++;
-                    // If we've found a new "best" candidate, clear the
-                    // current candidates and add it
-                    if (version > highestVersion) {
-                        highestVersion = version;
-                        nodesWithVersion.clear();
-                        nodesWithVersion.put(node, version);
-                    } else if (version == highestVersion) {
-                        // If the candidate is the same, add it to the
-                        // list, but keep the current candidate
-                        nodesWithVersion.put(node, version);
-                    }
-                }
-            }
-            // Now that we have a map of nodes to versions along with the
-            // number of allocations found (and not ignored), we need to sort
-            // it so the node with the highest version is at the beginning
-            List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList();
-            nodesWithHighestVersion.addAll(nodesWithVersion.keySet());
-            CollectionUtil.timSort(nodesWithHighestVersion, new Comparator<DiscoveryNode>() {
-                @Override
-                public int compare(DiscoveryNode o1, DiscoveryNode o2) {
-                    return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1));
-                }
-            });
-
-            if (logger.isDebugEnabled()) {
-                logger.debug("[{}][{}] found {} allocations of {}, highest version: [{}]",
-                        shard.index(), shard.id(), numberOfAllocationsFound, shard, highestVersion);
-            }
-            if (logger.isTraceEnabled()) {
-                StringBuilder sb = new StringBuilder("[");
-                for (DiscoveryNode n : nodesWithHighestVersion) {
-                    sb.append("[");
-                    sb.append(n.getName());
-                    sb.append("]");
-                    sb.append(" -> ");
-                    sb.append(nodesWithVersion.get(n));
-                    sb.append(", ");
-                }
-                sb.append("]");
-                logger.trace("{} candidates for allocation: {}", shard, sb.toString());
-            }
-
-            // check if the counts meets the minimum set
-            int requiredAllocation = 1;
-            // if we restore from a repository one copy is more then enough
-            if (shard.restoreSource() == null) {
-                try {
-                    String initialShards = indexMetaData.settings().get(INDEX_RECOVERY_INITIAL_SHARDS, settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
-                    if ("quorum".equals(initialShards)) {
-                        if (indexMetaData.numberOfReplicas() > 1) {
-                            requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
-                        }
-                    } else if ("quorum-1".equals(initialShards) || "half".equals(initialShards)) {
-                        if (indexMetaData.numberOfReplicas() > 2) {
-                            requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
-                        }
-                    } else if ("one".equals(initialShards)) {
-                        requiredAllocation = 1;
-                    } else if ("full".equals(initialShards) || "all".equals(initialShards)) {
-                        requiredAllocation = indexMetaData.numberOfReplicas() + 1;
-                    } else if ("full-1".equals(initialShards) || "all-1".equals(initialShards)) {
-                        if (indexMetaData.numberOfReplicas() > 1) {
-                            requiredAllocation = indexMetaData.numberOfReplicas();
-                        }
-                    } else {
-                        requiredAllocation = Integer.parseInt(initialShards);
-                    }
-                } catch (Exception e) {
-                    logger.warn("[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}", shard.index(), shard.id(), initialShards, shard);
-                }
-            }
-
-            // not enough found for this shard, continue...
-            if (numberOfAllocationsFound < requiredAllocation) {
-                // if we are restoring this shard we still can allocate
-                if (shard.restoreSource() == null) {
-                    // we can't really allocate, so ignore it and continue
-                    unassignedIterator.remove();
-                    routingNodes.ignoredUnassigned().add(shard);
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("[{}][{}]: not allocating, number_of_allocated_shards_found [{}], required_number [{}]", shard.index(), shard.id(), numberOfAllocationsFound, requiredAllocation);
-                    }
-                } else if (logger.isDebugEnabled()) {
-                    logger.debug("[{}][{}]: missing local data, will restore from [{}]", shard.index(), shard.id(), shard.restoreSource());
-                }
-                continue;
-            }
-
-            Set<DiscoveryNode> throttledNodes = Sets.newHashSet();
-            Set<DiscoveryNode> noNodes = Sets.newHashSet();
-            for (DiscoveryNode discoNode : nodesWithHighestVersion) {
-                RoutingNode node = routingNodes.node(discoNode.id());
-                if (node == null) {
-                    continue;
-                }
-
-                Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
-                if (decision.type() == Decision.Type.THROTTLE) {
-                    throttledNodes.add(discoNode);
-                } else if (decision.type() == Decision.Type.NO) {
-                    noNodes.add(discoNode);
-                } else {
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("[{}][{}]: allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode);
-                    }
-                    // we found a match
-                    changed = true;
-                    // make sure we create one with the version from the recovered state
-                    routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
-                    unassignedIterator.remove();
-
-                    // found a node, so no throttling, no "no", and break out of the loop
-                    throttledNodes.clear();
-                    noNodes.clear();
-                    break;
-                }
-            }
-            if (throttledNodes.isEmpty()) {
-                // if we have a node that we "can't" allocate to, force allocation, since this is our master data!
-                if (!noNodes.isEmpty()) {
-                    DiscoveryNode discoNode = noNodes.iterator().next();
-                    RoutingNode node = routingNodes.node(discoNode.id());
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("[{}][{}]: forcing allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, discoNode);
-                    }
-                    // we found a match
-                    changed = true;
-                    // make sure we create one with the version from the recovered state
-                    routingNodes.initialize(new ShardRouting(shard, highestVersion), node.nodeId());
-                    unassignedIterator.remove();
-                }
-            } else {
-                if (logger.isDebugEnabled()) {
-                    logger.debug("[{}][{}]: throttling allocation [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, throttledNodes);
-                }
-                // we are throttling this, but we have enough to allocate to this node, ignore it for now
-                unassignedIterator.remove();
-                routingNodes.ignoredUnassigned().add(shard);
-            }
-        }
-
-        if (!routingNodes.hasUnassigned()) {
-            return changed;
-        }
-
-        // Now, handle replicas, try to assign them to nodes that are similar to the one the primary was allocated on
-        unassignedIterator = unassigned.iterator();
-        while (unassignedIterator.hasNext()) {
-            ShardRouting shard = unassignedIterator.next();
-            if (shard.primary()) {
-                continue;
-            }
-
-            // pre-check if it can be allocated to any node that currently exists, so we won't list the store for it for nothing
-            boolean canBeAllocatedToAtLeastOneNode = false;
-            for (ObjectCursor<DiscoveryNode> cursor : nodes.dataNodes().values()) {
-                RoutingNode node = routingNodes.node(cursor.value.id());
-                if (node == null) {
-                    continue;
-                }
-                // if we can't allocate it on a node, ignore it, for example, this handles
-                // cases for only allocating a replica after a primary
-                Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
-                if (decision.type() == Decision.Type.YES) {
-                    canBeAllocatedToAtLeastOneNode = true;
-                    break;
-                }
-            }
-
-            if (!canBeAllocatedToAtLeastOneNode) {
-                logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
-                unassignedIterator.remove();
-                routingNodes.ignoredUnassigned().add(shard);
-                continue;
-            }
-
-            AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetch = asyncFetchStore.get(shard.shardId());
-            if (fetch == null) {
-                fetch = new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
-                asyncFetchStore.put(shard.shardId(), fetch);
-            }
-            AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetch.fetchData(nodes, metaData, allocation.getIgnoreNodes(shard.shardId()));
-            if (shardStores.hasData() == false) {
-                logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
-                unassignedIterator.remove();
-                routingNodes.ignoredUnassigned().add(shard);
-                continue; // still fetching
-            }
-            shardStores.processAllocation(allocation);
-
-            long lastSizeMatched = 0;
-            DiscoveryNode lastDiscoNodeMatched = null;
-            RoutingNode lastNodeMatched = null;
-            boolean hasReplicaData = false;
-            IndexMetaData indexMetaData = metaData.index(shard.getIndex());
-
-            for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> nodeStoreEntry : shardStores.getData().entrySet()) {
-                DiscoveryNode discoNode = nodeStoreEntry.getKey();
-                TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue().storeFilesMetaData();
-                logger.trace("{}: checking node [{}]", shard, discoNode);
-
-                if (storeFilesMetaData == null) {
-                    // already allocated on that node...
-                    continue;
-                }
-
-                RoutingNode node = routingNodes.node(discoNode.id());
-                if (node == null) {
-                    continue;
-                }
-
-                // check if we can allocate on that node...
-                // we only check for NO, since if this node is THROTTLING and it has enough "same data"
-                // then we will try and assign it next time
-                Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
-                if (decision.type() == Decision.Type.NO) {
-                    continue;
-                }
-
-                // if it is already allocated, we can't assign to it...
-                if (storeFilesMetaData.allocated()) {
-                    continue;
-                }
-
-                if (!shard.primary()) {
-                    hasReplicaData |= storeFilesMetaData.iterator().hasNext();
-                    ShardRouting primaryShard = routingNodes.activePrimary(shard);
-                    if (primaryShard != null) {
-                        assert primaryShard.active();
-                        DiscoveryNode primaryNode = nodes.get(primaryShard.currentNodeId());
-                        if (primaryNode != null) {
-                            TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore = shardStores.getData().get(primaryNode);
-                            if (primaryNodeFilesStore != null) {
-                                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = primaryNodeFilesStore.storeFilesMetaData();
-                                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
-                                    long sizeMatched = 0;
-
-                                    String primarySyncId = primaryNodeStore.syncId();
-                                    String replicaSyncId = storeFilesMetaData.syncId();
-                                    // see if we have a sync id we can make use of
-                                    if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
-                                        logger.trace("{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId);
-                                        lastNodeMatched = node;
-                                        lastSizeMatched = Long.MAX_VALUE;
-                                        lastDiscoNodeMatched = discoNode;
-                                    } else {
-                                        for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
-                                            String metaDataFileName = storeFileMetaData.name();
-                                            if (primaryNodeStore.fileExists(metaDataFileName) && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
-                                                sizeMatched += storeFileMetaData.length();
-                                            }
-                                        }
-                                        logger.trace("{}: node [{}] has [{}/{}] bytes of re-usable data",
-                                                shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched);
-                                        if (sizeMatched > lastSizeMatched) {
-                                            lastSizeMatched = sizeMatched;
-                                            lastDiscoNodeMatched = discoNode;
-                                            lastNodeMatched = node;
-                                        }
-                                    }
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-
-            if (lastNodeMatched != null) {
-                // we only check on THROTTLE since we checked before before on NO
-                Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
-                if (decision.type() == Decision.Type.THROTTLE) {
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched));
-                    }
-                    // we are throttling this, but we have enough to allocate to this node, ignore it for now
-                    unassignedIterator.remove();
-                    routingNodes.ignoredUnassigned().add(shard);
-                } else {
-                    if (logger.isDebugEnabled()) {
-                        logger.debug("[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched));
-                    }
-                    // we found a match
-                    changed = true;
-                    routingNodes.initialize(shard, lastNodeMatched.nodeId());
-                    unassignedIterator.remove();
-                }
-            } else if (hasReplicaData == false) {
-                // if we didn't manage to find *any* data (regardless of matching sizes), check if the allocation
-                // of the replica shard needs to be delayed, and if so, add it to the ignore unassigned list
-                // note: we only care about replica in delayed allocation, since if we have an unassigned primary it
-                //       will anyhow wait to find an existing copy of the shard to be allocated
-                // note: the other side of the equation is scheduling a reroute in a timely manner, which happens in the RoutingService
-                long delay = shard.unassignedInfo().getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
-                if (delay > 0) {
-                    logger.debug("[{}][{}]: delaying allocation of [{}] for [{}]", shard.index(), shard.id(), shard, TimeValue.timeValueMillis(delay));
-                    /**
-                     * mark it as changed, since we want to kick a publishing to schedule future allocation,
-                     * see {@link org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}).
-                     */
-                    changed = true;
-                    unassignedIterator.remove();
-                    routingNodes.ignoredUnassigned().add(shard);
-                }
-            }
-        }
+        changed |= primaryShardAllocator.allocateUnassigned(allocation);
+        changed |= replicaShardAllocator.allocateUnassigned(allocation);
        return changed;
    }

@ -558,4 +143,54 @@ public class GatewayAllocator extends AbstractComponent {
        }
    }

+    /**
+     * {@link PrimaryShardAllocator} wired to this allocator: the started-shard state is obtained through
+     * the per-shard fetches kept in {@code asyncFetchStarted}.
+     */
+    class InternalPrimaryShardAllocator extends PrimaryShardAllocator {
+
+        private final TransportNodesListGatewayStartedShards startedAction;
+
+        public InternalPrimaryShardAllocator(Settings settings, TransportNodesListGatewayStartedShards startedAction) {
+            super(settings);
+            this.startedAction = startedAction;
+        }
+
+        @Override
+        protected AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetchData(ShardRouting shard, RoutingAllocation allocation) {
+            final AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> result = startedFetchFor(shard)
+                    .fetchData(allocation.nodes(), allocation.metaData(), allocation.getIgnoreNodes(shard.shardId()));
+            // only a result that already carries data is applied to the allocation
+            if (result.hasData()) {
+                result.processAllocation(allocation);
+            }
+            return result;
+        }
+
+        /**
+         * Looks up the fetch registered for the shard in {@code asyncFetchStarted}; when there is none,
+         * a new "shard_started" fetch is created and registered under the shard id.
+         */
+        private AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> startedFetchFor(ShardRouting shard) {
+            final AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> registered = asyncFetchStarted.get(shard.shardId());
+            if (registered != null) {
+                return registered;
+            }
+            final AsyncShardFetch<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> created =
+                    new InternalAsyncFetch<>(logger, "shard_started", shard.shardId(), startedAction);
+            asyncFetchStarted.put(shard.shardId(), created);
+            return created;
+        }
+    }
+
+    /**
+     * {@link ReplicaShardAllocator} wired to this allocator: the shard store meta data is obtained through
+     * the per-shard fetches kept in {@code asyncFetchStore}.
+     */
+    class InternalReplicaShardAllocator extends ReplicaShardAllocator {
+
+        private final TransportNodesListShardStoreMetaData storeAction;
+
+        public InternalReplicaShardAllocator(Settings settings, TransportNodesListShardStoreMetaData storeAction) {
+            super(settings);
+            this.storeAction = storeAction;
+        }
+
+        @Override
+        protected AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetchData(ShardRouting shard, RoutingAllocation allocation) {
+            final AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> result = storeFetchFor(shard)
+                    .fetchData(allocation.nodes(), allocation.metaData(), allocation.getIgnoreNodes(shard.shardId()));
+            // only a result that already carries data is applied to the allocation
+            if (result.hasData()) {
+                result.processAllocation(allocation);
+            }
+            return result;
+        }
+
+        /**
+         * Looks up the fetch registered for the shard in {@code asyncFetchStore}; when there is none,
+         * a new "shard_store" fetch is created and registered under the shard id.
+         */
+        private AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> storeFetchFor(ShardRouting shard) {
+            final AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> registered = asyncFetchStore.get(shard.shardId());
+            if (registered != null) {
+                return registered;
+            }
+            final AsyncShardFetch<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> created =
+                    new InternalAsyncFetch<>(logger, "shard_store", shard.shardId(), storeAction);
+            asyncFetchStore.put(shard.shardId(), created);
+            return created;
+        }
+    }
 }
--- a/core/src/main/java/org/elasticsearch/gateway/PrimaryShardAllocator.java
+++ b/core/src/main/java/org/elasticsearch/gateway/PrimaryShardAllocator.java
@ -0,0 +1,293 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.gateway;
+
+import com.google.common.collect.Lists;
+import com.google.common.collect.Maps;
+import org.apache.lucene.util.CollectionUtil;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.cluster.metadata.MetaData;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
+import org.elasticsearch.cluster.routing.RoutingNode;
+import org.elasticsearch.cluster.routing.RoutingNodes;
+import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
+import org.elasticsearch.cluster.routing.allocation.decider.Decision;
+import org.elasticsearch.common.component.AbstractComponent;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.settings.IndexSettings;
+
+import java.util.*;
+
+/**
+ * The primary shard allocator allocates primary shards that were not created as
+ * a result of an API call to a node that held them last, so that they can be recovered there.
+ */
+public abstract class PrimaryShardAllocator extends AbstractComponent {
+
+    public static final String INDEX_RECOVERY_INITIAL_SHARDS = "index.recovery.initial_shards";
+
+    private final String initialShards;
+
+    /**
+     * Reads the default {@code initial_shards} value from the given settings: {@code gateway.initial_shards}
+     * is looked up with {@code gateway.local.initial_shards} as its default, which in turn defaults to
+     * {@code "quorum"}.
+     */
+    public PrimaryShardAllocator(Settings settings) {
+        super(settings);
+        final String fallback = settings.get("gateway.local.initial_shards", "quorum");
+        this.initialShards = settings.get("gateway.initial_shards", fallback);
+        logger.debug("using initial_shards [{}]", initialShards);
+    }
+
+    /**
+     * Walks the unassigned shards of the allocation and, for every shard accepted by
+     * {@link #needToFindPrimaryCopy}, tries to initialize it on one of the nodes reported by
+     * {@link #fetchData}.
+     * <p>
+     * A shard is handed to {@code removeAndIgnore()} when its fetch has no data yet, when too few
+     * copies were found and it has no restore source, or when no node can take it right now.
+     *
+     * @param allocation the current allocation, providing routing nodes, deciders and ignored nodes
+     * @return {@code true} if at least one shard was initialized on a node
+     */
+    public boolean allocateUnassigned(RoutingAllocation allocation) {
+        boolean changed = false;
+        final RoutingNodes routingNodes = allocation.routingNodes();
+        final MetaData metaData = routingNodes.metaData();
+
+        final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator();
+        while (unassignedIterator.hasNext()) {
+            ShardRouting shard = unassignedIterator.next();
+
+            // shards rejected here stay in the unassigned list untouched
+            if (needToFindPrimaryCopy(shard, routingNodes.routingTable().index(shard.index()).shard(shard.id())) == false) {
+                continue;
+            }
+
+            AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> shardState = fetchData(shard, allocation);
+            if (shardState.hasData() == false) {
+                logger.trace("{}: ignoring allocation, still fetching shard started state", shard);
+                unassignedIterator.removeAndIgnore();
+                continue;
+            }
+
+            IndexMetaData indexMetaData = metaData.index(shard.getIndex());
+
+            NodesAndVersions nodesAndVersions = buildNodesAndVersions(shard, recoverOnAnyNode(indexMetaData.settings()), allocation.getIgnoreNodes(shard.shardId()), shardState);
+            logger.debug("[{}][{}] found {} allocations of {}, highest version: [{}]", shard.index(), shard.id(), nodesAndVersions.allocationsFound, shard, nodesAndVersions.highestVersion);
+
+            if (isEnoughAllocationsFound(shard, indexMetaData, nodesAndVersions) == false) {
+                // if we are restoring this shard we still can allocate
+                if (shard.restoreSource() == null) {
+                    // we can't really allocate, so ignore it and continue
+                    unassignedIterator.removeAndIgnore();
+                    logger.debug("[{}][{}]: not allocating, number_of_allocated_shards_found [{}]", shard.index(), shard.id(), nodesAndVersions.allocationsFound);
+                } else {
+                    // a shard with a restore source is left in the unassigned list, only the fact is logged
+                    logger.debug("[{}][{}]: missing local data, will restore from [{}]", shard.index(), shard.id(), shard.restoreSource());
+                }
+                continue;
+            }
+
+            NodesToAllocate nodesToAllocate = buildNodesToAllocate(shard, allocation, nodesAndVersions);
+            if (nodesToAllocate.yesNodes.isEmpty() == false) {
+                // first choice: the first node the deciders did not throttle or reject
+                DiscoveryNode node = nodesToAllocate.yesNodes.get(0);
+                logger.debug("[{}][{}]: allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, node);
+                changed = true;
+                unassignedIterator.initialize(node.id(), nodesAndVersions.highestVersion);
+            } else if (nodesToAllocate.throttleNodes.isEmpty() == true && nodesToAllocate.noNodes.isEmpty() == false) {
+                // nobody throttles and every candidate said NO: the allocation is forced onto the first rejecting node
+                DiscoveryNode node = nodesToAllocate.noNodes.get(0);
+                logger.debug("[{}][{}]: forcing allocating [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, node);
+                changed = true;
+                unassignedIterator.initialize(node.id(), nodesAndVersions.highestVersion);
+            } else {
+                // we are throttling this, but we have enough to allocate to this node, ignore it for now
+                // (this branch is also taken when all three node lists are empty)
+                logger.debug("[{}][{}]: throttling allocation [{}] to [{}] on primary allocation", shard.index(), shard.id(), shard, nodesToAllocate.throttleNodes);
+                unassignedIterator.removeAndIgnore();
+            }
+        }
+        return changed;
+    }
+
+    /**
+     * Tells whether a primary copy has to be looked up for the given shard.
+     *
+     * @return {@code true} only for a primary whose shard routing table reports
+     *         {@code primaryAllocatedPostApi()}; {@code false} for replicas and for primaries that
+     *         are still an API allocation (we know there is no data for those)
+     */
+    boolean needToFindPrimaryCopy(ShardRouting shard, IndexShardRoutingTable indexShardRoutingTable) {
+        // the routing table is only consulted for primaries
+        return shard.primary() && indexShardRoutingTable.primaryAllocatedPostApi();
+    }
+
+    /**
+     * Checks whether enough copies of the shard were found to go ahead with allocating the primary.
+     * <p>
+     * A shard with a restore source requires a single copy. Otherwise the requirement is derived from
+     * {@link #INDEX_RECOVERY_INITIAL_SHARDS}, looked up in the index settings, then in the node settings,
+     * then falling back to the value read in the constructor. Accepted values are {@code quorum},
+     * {@code quorum-1}/{@code half}, {@code one}, {@code full}/{@code all}, {@code full-1}/{@code all-1}
+     * or an explicit integer. If the value cannot be interpreted a warning is logged and the requirement
+     * stays at one copy.
+     *
+     * @param shard            the unassigned primary being considered
+     * @param indexMetaData    meta data of the index the shard belongs to
+     * @param nodesAndVersions the copies that were found for the shard
+     * @return {@code true} if {@code nodesAndVersions.allocationsFound} reaches the required number
+     */
+    private boolean isEnoughAllocationsFound(ShardRouting shard, IndexMetaData indexMetaData, NodesAndVersions nodesAndVersions) {
+        // check if the counts meets the minimum set
+        int requiredAllocation = 1;
+        // if we restore from a repository one copy is more than enough
+        if (shard.restoreSource() == null) {
+            // Declared outside the try block on purpose: a local declared inside it is out of scope in the
+            // catch block, where the name "initialShards" would silently resolve to the field holding the
+            // node-level default instead of the value that actually failed to parse.
+            String resolvedInitialShards = null;
+            try {
+                resolvedInitialShards = indexMetaData.settings().get(INDEX_RECOVERY_INITIAL_SHARDS, settings.get(INDEX_RECOVERY_INITIAL_SHARDS, this.initialShards));
+                if ("quorum".equals(resolvedInitialShards)) {
+                    if (indexMetaData.numberOfReplicas() > 1) {
+                        requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2) + 1;
+                    }
+                } else if ("quorum-1".equals(resolvedInitialShards) || "half".equals(resolvedInitialShards)) {
+                    if (indexMetaData.numberOfReplicas() > 2) {
+                        requiredAllocation = ((1 + indexMetaData.numberOfReplicas()) / 2);
+                    }
+                } else if ("one".equals(resolvedInitialShards)) {
+                    requiredAllocation = 1;
+                } else if ("full".equals(resolvedInitialShards) || "all".equals(resolvedInitialShards)) {
+                    requiredAllocation = indexMetaData.numberOfReplicas() + 1;
+                } else if ("full-1".equals(resolvedInitialShards) || "all-1".equals(resolvedInitialShards)) {
+                    if (indexMetaData.numberOfReplicas() > 1) {
+                        requiredAllocation = indexMetaData.numberOfReplicas();
+                    }
+                } else {
+                    // anything else must be an explicit number of copies
+                    requiredAllocation = Integer.parseInt(resolvedInitialShards);
+                }
+            } catch (Exception e) {
+                // keep the cause in the log; requiredAllocation stays at whatever was set before the failure (1)
+                logger.warn("[{}][{}] failed to derived initial_shards from value {}, ignore allocation for {}", e, shard.index(), shard.id(), resolvedInitialShards, shard);
+            }
+        }
+
+        return nodesAndVersions.allocationsFound >= requiredAllocation;
+    }
+
+    /**
+     * Sorts the candidate nodes into three buckets according to the allocation deciders' verdict for the
+     * shard: nodes that accept it, nodes that throttle it and nodes that reject it. Candidates without a
+     * routing node are left out; within a bucket the order of {@code nodesAndVersions.nodes} is kept.
+     *
+     * @return the three buckets as unmodifiable lists
+     */
+    private NodesToAllocate buildNodesToAllocate(ShardRouting shard, RoutingAllocation allocation, NodesAndVersions nodesAndVersions) {
+        final List<DiscoveryNode> accepting = new ArrayList<>();
+        final List<DiscoveryNode> throttling = new ArrayList<>();
+        final List<DiscoveryNode> rejecting = new ArrayList<>();
+        for (DiscoveryNode candidate : nodesAndVersions.nodes) {
+            final RoutingNode routingNode = allocation.routingNodes().node(candidate.id());
+            if (routingNode != null) {
+                final Decision.Type verdict = allocation.deciders().canAllocate(shard, routingNode, allocation).type();
+                if (verdict == Decision.Type.NO) {
+                    rejecting.add(candidate);
+                } else if (verdict == Decision.Type.THROTTLE) {
+                    throttling.add(candidate);
+                } else {
+                    // every verdict other than THROTTLE and NO counts as acceptance
+                    accepting.add(candidate);
+                }
+            }
+        }
+        return new NodesToAllocate(
+                Collections.unmodifiableList(accepting),
+                Collections.unmodifiableList(throttling),
+                Collections.unmodifiableList(rejecting));
+    }
+
+    /**
+     * Builds, from the fetched per-node shard state, the list of nodes that are candidates for the primary,
+     * together with the number of copies found and the highest version seen.
+     * <p>
+     * Nodes contained in {@code ignoreNodes} are skipped. A copy whose store reported an exception is
+     * treated as version -1. With {@code recoveryOnAnyNode} every remaining node counts as a copy and is
+     * kept as a candidate; otherwise only nodes with a version other than -1 count, and only the nodes
+     * holding the highest version are kept.
+     *
+     * @param shard             the shard the state was fetched for (used for logging only)
+     * @param recoveryOnAnyNode whether nodes without a copy may be candidates as well
+     * @param ignoreNodes       ids of nodes that must not be considered
+     * @param shardState        the fetched state, one entry per node
+     * @return the candidates sorted by descending version, the number of copies found and the highest
+     *         version (-1 if none was found)
+     */
+    private NodesAndVersions buildNodesAndVersions(ShardRouting shard, boolean recoveryOnAnyNode, Set<String> ignoreNodes,
+                                                   AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> shardState) {
+        final Map<DiscoveryNode, Long> nodesWithVersion = Maps.newHashMap();
+        int numberOfAllocationsFound = 0;
+        long highestVersion = -1;
+        for (TransportNodesListGatewayStartedShards.NodeGatewayStartedShards nodeShardState : shardState.getData().values()) {
+            long version = nodeShardState.version();
+            DiscoveryNode node = nodeShardState.getNode();
+
+            if (ignoreNodes.contains(node.id())) {
+                continue;
+            }
+
+            // a version of -1 means the shard does not exist on that node; that is what the API returns
+            // and what we expect here
+            if (nodeShardState.storeException() == null) {
+                logger.trace("[{}] on node [{}] has version [{}] of shard", shard, nodeShardState.getNode(), version);
+            } else {
+                // when there is an store exception, we disregard the reported version and assign it as -1 (same as shard does not exist)
+                logger.trace("[{}] on node [{}] has version [{}] but the store can not be opened, treating as version -1", nodeShardState.storeException(), shard, nodeShardState.getNode(), version);
+                version = -1;
+            }
+
+            if (recoveryOnAnyNode) {
+                numberOfAllocationsFound++;
+                if (version > highestVersion) {
+                    highestVersion = version;
+                }
+                // We always put the node without clearing the map
+                nodesWithVersion.put(node, version);
+            } else if (version != -1) {
+                numberOfAllocationsFound++;
+                // If we've found a new "best" candidate, clear the
+                // current candidates and add it
+                if (version > highestVersion) {
+                    highestVersion = version;
+                    nodesWithVersion.clear();
+                    nodesWithVersion.put(node, version);
+                } else if (version == highestVersion) {
+                    // If the candidate is the same, add it to the
+                    // list, but keep the current candidate
+                    nodesWithVersion.put(node, version);
+                }
+            }
+        }
+        // Now that we have a map of nodes to versions along with the
+        // number of allocations found (and not ignored), we need to sort
+        // it so the node with the highest version is at the beginning
+        List<DiscoveryNode> nodesWithHighestVersion = Lists.newArrayList();
+        nodesWithHighestVersion.addAll(nodesWithVersion.keySet());
+        CollectionUtil.timSort(nodesWithHighestVersion, new Comparator<DiscoveryNode>() {
+            @Override
+            public int compare(DiscoveryNode o1, DiscoveryNode o2) {
+                return Long.compare(nodesWithVersion.get(o2), nodesWithVersion.get(o1));
+            }
+        });
+
+        if (logger.isTraceEnabled()) {
+            // report the candidates in sorted order (the order in which they are handed to the caller),
+            // not in the arbitrary iteration order of the hash map
+            StringBuilder sb = new StringBuilder("[");
+            for (DiscoveryNode n : nodesWithHighestVersion) {
+                sb.append("[").append(n.getName()).append("]").append(" -> ").append(nodesWithVersion.get(n)).append(", ");
+            }
+            sb.append("]");
+            logger.trace("{} candidates for allocation: {}", shard, sb.toString());
+        }
+
+        return new NodesAndVersions(Collections.unmodifiableList(nodesWithHighestVersion), numberOfAllocationsFound, highestVersion);
+    }
+
+    /**
+     * Tells whether shards of the index may be recovered on any node: the index has to be on a shared
+     * filesystem and {@link IndexMetaData#SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE} has to be enabled
+     * (it is treated as {@code false} when not set).
+     */
+    private boolean recoverOnAnyNode(@IndexSettings Settings idxSettings) {
+        if (IndexMetaData.isOnSharedFilesystem(idxSettings) == false) {
+            return false;
+        }
+        return idxSettings.getAsBoolean(IndexMetaData.SETTING_SHARED_FS_ALLOW_RECOVERY_ON_ANY_NODE, false);
+    }
+
+    /**
+     * Supplies the started-shard state of the given shard as reported by the nodes.
+     * <p>
+     * {@link #allocateUnassigned} treats a result whose {@code hasData()} is {@code false} as
+     * "still fetching" and postpones the shard; otherwise the values of {@code getData()} are read,
+     * one entry per node.
+     *
+     * @param shard      the unassigned shard to fetch the state for
+     * @param allocation the allocation the fetch is performed for
+     * @return the fetch result for the shard
+     */
+    protected abstract AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetchData(ShardRouting shard, RoutingAllocation allocation);
+
+    /**
+     * Immutable holder for the outcome of {@code buildNodesAndVersions}.
+     */
+    static class NodesAndVersions {
+        /** candidate nodes, ordered by descending shard version */
+        public final List<DiscoveryNode> nodes;
+        /** number of shard copies that were counted */
+        public final int allocationsFound;
+        /** highest shard version seen, -1 if none */
+        public final long highestVersion;
+
+        public NodesAndVersions(List<DiscoveryNode> nodes, int allocationsFound, long highestVersion) {
+            this.highestVersion = highestVersion;
+            this.allocationsFound = allocationsFound;
+            this.nodes = nodes;
+        }
+    }
+
+    /**
+     * Immutable holder for the outcome of {@code buildNodesToAllocate}: the candidate nodes grouped by
+     * the verdict of the allocation deciders.
+     */
+    static class NodesToAllocate {
+        /** nodes the deciders neither throttled nor rejected */
+        final List<DiscoveryNode> yesNodes;
+        /** nodes for which the deciders answered THROTTLE */
+        final List<DiscoveryNode> throttleNodes;
+        /** nodes for which the deciders answered NO */
+        final List<DiscoveryNode> noNodes;
+
+        public NodesToAllocate(List<DiscoveryNode> yesNodes, List<DiscoveryNode> throttleNodes, List<DiscoveryNode> noNodes) {
+            this.noNodes = noNodes;
+            this.throttleNodes = throttleNodes;
+            this.yesNodes = yesNodes;
+        }
+    }
+}
--- a/core/src/main/java/org/elasticsearch/gateway/ReplicaShardAllocator.java
+++ b/core/src/main/java/org/elasticsearch/gateway/ReplicaShardAllocator.java
@ -0,0 +1,207 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.gateway;
+
+import com.carrotsearch.hppc.cursors.ObjectCursor;
+import org.elasticsearch.cluster.ClusterChangedEvent;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.cluster.metadata.MetaData;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.routing.RoutingNode;
+import org.elasticsearch.cluster.routing.RoutingNodes;
+import org.elasticsearch.cluster.routing.ShardRouting;
+import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
+import org.elasticsearch.cluster.routing.allocation.decider.Decision;
+import org.elasticsearch.common.component.AbstractComponent;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.common.unit.ByteSizeValue;
+import org.elasticsearch.common.unit.TimeValue;
+import org.elasticsearch.index.store.StoreFileMetaData;
+import org.elasticsearch.indices.store.TransportNodesListShardStoreMetaData;
+
+import java.util.Iterator;
+import java.util.Map;
+
+/**
+ */
+public abstract class ReplicaShardAllocator extends AbstractComponent {
+
+    public ReplicaShardAllocator(Settings settings) {
+        super(settings);
+    }
+
+    public boolean allocateUnassigned(RoutingAllocation allocation) {
+        boolean changed = false;
+        final RoutingNodes routingNodes = allocation.routingNodes();
+        final MetaData metaData = routingNodes.metaData();
+
+        final RoutingNodes.UnassignedShards.UnassignedIterator unassignedIterator = routingNodes.unassigned().iterator();
+        while (unassignedIterator.hasNext()) {
+            ShardRouting shard = unassignedIterator.next();
+            if (shard.primary()) {
+                continue;
+            }
+
+            // pre-check if it can be allocated to any node that currently exists, so we won't list the store for it for nothing
+            boolean canBeAllocatedToAtLeastOneNode = false;
+            for (ObjectCursor<DiscoveryNode> cursor : allocation.nodes().dataNodes().values()) {
+                RoutingNode node = routingNodes.node(cursor.value.id());
+                if (node == null) {
+                    continue;
+                }
+                // if we can't allocate it on any node, ignore it; this handles, for example,
+                // the case where a replica may only be allocated after its primary
+                Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
+                if (decision.type() == Decision.Type.YES) {
+                    canBeAllocatedToAtLeastOneNode = true;
+                    break;
+                }
+            }
+
+            if (!canBeAllocatedToAtLeastOneNode) {
+                logger.trace("{}: ignoring allocation, can't be allocated on any node", shard);
+                unassignedIterator.removeAndIgnore();
+                continue;
+            }
+
+            AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> shardStores = fetchData(shard, allocation);
+            if (shardStores.hasData() == false) {
+                logger.trace("{}: ignoring allocation, still fetching shard stores", shard);
+                unassignedIterator.removeAndIgnore();
+                continue; // still fetching
+            }
+
+            long lastSizeMatched = 0;
+            DiscoveryNode lastDiscoNodeMatched = null;
+            RoutingNode lastNodeMatched = null;
+            boolean hasReplicaData = false;
+            IndexMetaData indexMetaData = metaData.index(shard.getIndex());
+
+            for (Map.Entry<DiscoveryNode, TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> nodeStoreEntry : shardStores.getData().entrySet()) {
+                DiscoveryNode discoNode = nodeStoreEntry.getKey();
+                TransportNodesListShardStoreMetaData.StoreFilesMetaData storeFilesMetaData = nodeStoreEntry.getValue().storeFilesMetaData();
+                logger.trace("{}: checking node [{}]", shard, discoNode);
+
+                if (storeFilesMetaData == null) {
+                    // already allocated on that node...
+                    continue;
+                }
+
+                RoutingNode node = routingNodes.node(discoNode.id());
+                if (node == null) {
+                    continue;
+                }
+
+                // check if we can allocate on that node...
+                // we only check for NO, since if this node is THROTTLING and it has enough "same data"
+                // then we will try and assign it next time
+                Decision decision = allocation.deciders().canAllocate(shard, node, allocation);
+                if (decision.type() == Decision.Type.NO) {
+                    continue;
+                }
+
+                // if it is already allocated, we can't assign to it...
+                if (storeFilesMetaData.allocated()) {
+                    continue;
+                }
+
+                if (!shard.primary()) {
+                    hasReplicaData |= storeFilesMetaData.iterator().hasNext();
+                    ShardRouting primaryShard = routingNodes.activePrimary(shard);
+                    if (primaryShard != null) {
+                        assert primaryShard.active();
+                        DiscoveryNode primaryNode = allocation.nodes().get(primaryShard.currentNodeId());
+                        if (primaryNode != null) {
+                            TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData primaryNodeFilesStore = shardStores.getData().get(primaryNode);
+                            if (primaryNodeFilesStore != null) {
+                                TransportNodesListShardStoreMetaData.StoreFilesMetaData primaryNodeStore = primaryNodeFilesStore.storeFilesMetaData();
+                                if (primaryNodeStore != null && primaryNodeStore.allocated()) {
+                                    long sizeMatched = 0;
+
+                                    String primarySyncId = primaryNodeStore.syncId();
+                                    String replicaSyncId = storeFilesMetaData.syncId();
+                                    // see if we have a sync id we can make use of
+                                    if (replicaSyncId != null && replicaSyncId.equals(primarySyncId)) {
+                                        logger.trace("{}: node [{}] has same sync id {} as primary", shard, discoNode.name(), replicaSyncId);
+                                        lastNodeMatched = node;
+                                        lastSizeMatched = Long.MAX_VALUE;
+                                        lastDiscoNodeMatched = discoNode;
+                                    } else {
+                                        for (StoreFileMetaData storeFileMetaData : storeFilesMetaData) {
+                                            String metaDataFileName = storeFileMetaData.name();
+                                            if (primaryNodeStore.fileExists(metaDataFileName) && primaryNodeStore.file(metaDataFileName).isSame(storeFileMetaData)) {
+                                                sizeMatched += storeFileMetaData.length();
+                                            }
+                                        }
+                                        logger.trace("{}: node [{}] has [{}/{}] bytes of re-usable data",
+                                                shard, discoNode.name(), new ByteSizeValue(sizeMatched), sizeMatched);
+                                        if (sizeMatched > lastSizeMatched) {
+                                            lastSizeMatched = sizeMatched;
+                                            lastDiscoNodeMatched = discoNode;
+                                            lastNodeMatched = node;
+                                        }
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }
+            }
+
+            if (lastNodeMatched != null) {
+                // we only check for THROTTLE here, since NO was already ruled out in the loop above
+                Decision decision = allocation.deciders().canAllocate(shard, lastNodeMatched, allocation);
+                if (decision.type() == Decision.Type.THROTTLE) {
+                    if (logger.isDebugEnabled()) {
+                        logger.debug("[{}][{}]: throttling allocation [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched));
+                    }
+                    // we are throttling this, but we have enough to allocate to this node, ignore it for now
+                    unassignedIterator.removeAndIgnore();
+                } else {
+                    if (logger.isDebugEnabled()) {
+                        logger.debug("[{}][{}]: allocating [{}] to [{}] in order to reuse its unallocated persistent store with total_size [{}]", shard.index(), shard.id(), shard, lastDiscoNodeMatched, new ByteSizeValue(lastSizeMatched));
+                    }
+                    // we found a match
+                    changed = true;
+                    unassignedIterator.initialize(lastNodeMatched.nodeId());
+                }
+            } else if (hasReplicaData == false) {
+                // if we didn't manage to find *any* data (regardless of matching sizes), check if the allocation
+                // of the replica shard needs to be delayed, and if so, add it to the ignore unassigned list
+                // note: we only care about replicas in delayed allocation, since an unassigned primary
+                //       will wait anyway to find an existing copy of the shard to be allocated
+                // note: the other side of the equation is scheduling a reroute in a timely manner, which happens in the RoutingService
+                long delay = shard.unassignedInfo().getDelayAllocationExpirationIn(settings, indexMetaData.getSettings());
+                if (delay > 0) {
+                    logger.debug("[{}][{}]: delaying allocation of [{}] for [{}]", shard.index(), shard.id(), shard, TimeValue.timeValueMillis(delay));
+                    /**
+                     * mark it as changed, since we want to trigger a publish so that the future allocation gets scheduled,
+                     * see {@link org.elasticsearch.cluster.routing.RoutingService#clusterChanged(ClusterChangedEvent)}.
+                     */
+                    changed = true;
+                    unassignedIterator.removeAndIgnore();
+                }
+            }
+        }
+        return changed;
+    }
+
+    protected abstract AsyncShardFetch.FetchResult<TransportNodesListShardStoreMetaData.NodeStoreFilesMetaData> fetchData(ShardRouting shard, RoutingAllocation allocation);
+}
--- a/core/src/main/java/org/elasticsearch/index/search/stats/ShardSearchStats.java
+++ b/core/src/main/java/org/elasticsearch/index/search/stats/ShardSearchStats.java
@ -175,7 +175,7 @@ public final class ShardSearchStats {

    public void onFreeScrollContext(SearchContext context) {
        totalStats.scrollCurrent.dec();
-        totalStats.scrollMetric.inc(TimeUnit.MILLISECONDS.toNanos(System.currentTimeMillis() - context.nowInMillis()));
+        totalStats.scrollMetric.inc(System.nanoTime() - context.getOriginNanoTime());
    }

    public void onRefreshSettings(Settings settings) {
--- a/core/src/main/java/org/elasticsearch/index/settings/IndexDynamicSettingsModule.java
+++ b/core/src/main/java/org/elasticsearch/index/settings/IndexDynamicSettingsModule.java
@ -20,6 +20,7 @@
 package org.elasticsearch.index.settings;

 import org.elasticsearch.cluster.routing.UnassignedInfo;
+import org.elasticsearch.gateway.PrimaryShardAllocator;
 import org.elasticsearch.index.shard.MergeSchedulerConfig;
 import org.elasticsearch.cluster.metadata.IndexMetaData;
 import org.elasticsearch.cluster.routing.allocation.decider.DisableAllocationDecider;
@ -74,7 +75,7 @@ public class IndexDynamicSettingsModule extends AbstractModule {
        indexDynamicSettings.addDynamicSetting(IndexMetaData.SETTING_PRIORITY, Validator.NON_NEGATIVE_INTEGER);
        indexDynamicSettings.addDynamicSetting(IndicesTTLService.INDEX_TTL_DISABLE_PURGE);
        indexDynamicSettings.addDynamicSetting(IndexShard.INDEX_REFRESH_INTERVAL, Validator.TIME);
-        indexDynamicSettings.addDynamicSetting(GatewayAllocator.INDEX_RECOVERY_INITIAL_SHARDS);
+        indexDynamicSettings.addDynamicSetting(PrimaryShardAllocator.INDEX_RECOVERY_INITIAL_SHARDS);
        indexDynamicSettings.addDynamicSetting(EngineConfig.INDEX_COMPOUND_ON_FLUSH, Validator.BOOLEAN);
        indexDynamicSettings.addDynamicSetting(EngineConfig.INDEX_GC_DELETES_SETTING, Validator.TIME);
        indexDynamicSettings.addDynamicSetting(IndexShard.INDEX_FLUSH_ON_CLOSE, Validator.BOOLEAN);
--- a/core/src/main/java/org/elasticsearch/percolator/PercolateContext.java
+++ b/core/src/main/java/org/elasticsearch/percolator/PercolateContext.java
@ -98,6 +98,7 @@ public class PercolateContext extends SearchContext {
    private final ConcurrentMap<BytesRef, Query> percolateQueries;
    private final int numberOfShards;
    private final Query aliasFilter;
+    private final long originNanoTime = System.nanoTime();
    private final long startTime;
    private String[] types;

@ -337,6 +338,11 @@ public class PercolateContext extends SearchContext {
        throw new UnsupportedOperationException();
    }

+    @Override
+    public long getOriginNanoTime() {
+        return originNanoTime;
+    }
+
    @Override
    protected long nowInMillisImpl() {
        return startTime;
--- a/core/src/main/java/org/elasticsearch/rest/action/cat/RestIndicesAction.java
+++ b/core/src/main/java/org/elasticsearch/rest/action/cat/RestIndicesAction.java
@ -250,6 +250,15 @@ public class RestIndicesAction extends AbstractCatAction {
        table.addCell("search.query_total", "sibling:pri;alias:sqto,searchQueryTotal;default:false;text-align:right;desc:total query phase ops");
        table.addCell("pri.search.query_total", "default:false;text-align:right;desc:total query phase ops");

+        table.addCell("search.scroll_current", "sibling:pri;alias:scc,searchScrollCurrent;default:false;text-align:right;desc:open scroll contexts");
+        table.addCell("pri.search.scroll_current", "default:false;text-align:right;desc:open scroll contexts");
+
+        table.addCell("search.scroll_time", "sibling:pri;alias:scti,searchScrollTime;default:false;text-align:right;desc:time scroll contexts held open");
+        table.addCell("pri.search.scroll_time", "default:false;text-align:right;desc:time scroll contexts held open");
+
+        table.addCell("search.scroll_total", "sibling:pri;alias:scto,searchScrollTotal;default:false;text-align:right;desc:completed scroll contexts");
+        table.addCell("pri.search.scroll_total", "default:false;text-align:right;desc:completed scroll contexts");
+
        table.addCell("segments.count", "sibling:pri;alias:sc,segmentsCount;default:false;text-align:right;desc:number of segments");
        table.addCell("pri.segments.count", "default:false;text-align:right;desc:number of segments");

@ -449,6 +458,15 @@ public class RestIndicesAction extends AbstractCatAction {
            table.addCell(indexStats == null ? null : indexStats.getTotal().getSearch().getTotal().getQueryCount());
            table.addCell(indexStats == null ? null : indexStats.getPrimaries().getSearch().getTotal().getQueryCount());

+            table.addCell(indexStats == null ? null : indexStats.getTotal().getSearch().getTotal().getScrollCurrent());
+            table.addCell(indexStats == null ? null : indexStats.getPrimaries().getSearch().getTotal().getScrollCurrent());
+
+            table.addCell(indexStats == null ? null : indexStats.getTotal().getSearch().getTotal().getScrollTime());
+            table.addCell(indexStats == null ? null : indexStats.getPrimaries().getSearch().getTotal().getScrollTime());
+
+            table.addCell(indexStats == null ? null : indexStats.getTotal().getSearch().getTotal().getScrollCount());
+            table.addCell(indexStats == null ? null : indexStats.getPrimaries().getSearch().getTotal().getScrollCount());
+
            table.addCell(indexStats == null ? null : indexStats.getTotal().getSegments().getCount());
            table.addCell(indexStats == null ? null : indexStats.getPrimaries().getSegments().getCount());

--- a/core/src/main/java/org/elasticsearch/rest/action/cat/RestNodesAction.java
+++ b/core/src/main/java/org/elasticsearch/rest/action/cat/RestNodesAction.java
@ -190,6 +190,9 @@ public class RestNodesAction extends AbstractCatAction {
        table.addCell("search.query_current", "alias:sqc,searchQueryCurrent;default:false;text-align:right;desc:current query phase ops");
        table.addCell("search.query_time", "alias:sqti,searchQueryTime;default:false;text-align:right;desc:time spent in query phase");
        table.addCell("search.query_total", "alias:sqto,searchQueryTotal;default:false;text-align:right;desc:total query phase ops");
+        table.addCell("search.scroll_current", "alias:scc,searchScrollCurrent;default:false;text-align:right;desc:open scroll contexts");
+        table.addCell("search.scroll_time", "alias:scti,searchScrollTime;default:false;text-align:right;desc:time scroll contexts held open");
+        table.addCell("search.scroll_total", "alias:scto,searchScrollTotal;default:false;text-align:right;desc:completed scroll contexts");

        table.addCell("segments.count", "alias:sc,segmentsCount;default:false;text-align:right;desc:number of segments");
        table.addCell("segments.memory", "alias:sm,segmentsMemory;default:false;text-align:right;desc:memory used by segments");
@ -322,6 +325,9 @@ public class RestNodesAction extends AbstractCatAction {
            table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCurrent());
            table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryTime());
            table.addCell(searchStats == null ? null : searchStats.getTotal().getQueryCount());
+            table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCurrent());
+            table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollTime());
+            table.addCell(searchStats == null ? null : searchStats.getTotal().getScrollCount());

            SegmentsStats segmentsStats = indicesStats == null ? null : indicesStats.getSegments();
            table.addCell(segmentsStats == null ? null : segmentsStats.getCount());
--- a/core/src/main/java/org/elasticsearch/rest/action/cat/RestShardsAction.java
+++ b/core/src/main/java/org/elasticsearch/rest/action/cat/RestShardsAction.java
@ -146,6 +146,9 @@ public class RestShardsAction extends AbstractCatAction {
        table.addCell("search.query_current", "alias:sqc,searchQueryCurrent;default:false;text-align:right;desc:current query phase ops");
        table.addCell("search.query_time", "alias:sqti,searchQueryTime;default:false;text-align:right;desc:time spent in query phase");
        table.addCell("search.query_total", "alias:sqto,searchQueryTotal;default:false;text-align:right;desc:total query phase ops");
+        table.addCell("search.scroll_current", "alias:scc,searchScrollCurrent;default:false;text-align:right;desc:open scroll contexts");
+        table.addCell("search.scroll_time", "alias:scti,searchScrollTime;default:false;text-align:right;desc:time scroll contexts held open");
+        table.addCell("search.scroll_total", "alias:scto,searchScrollTotal;default:false;text-align:right;desc:completed scroll contexts");

        table.addCell("segments.count", "alias:sc,segmentsCount;default:false;text-align:right;desc:number of segments");
        table.addCell("segments.memory", "alias:sm,segmentsMemory;default:false;text-align:right;desc:memory used by segments");
@ -277,6 +280,9 @@ public class RestShardsAction extends AbstractCatAction {
            table.addCell(shardStats == null ? null : shardStats.getSearch().getTotal().getQueryCurrent());
            table.addCell(shardStats == null ? null : shardStats.getSearch().getTotal().getQueryTime());
            table.addCell(shardStats == null ? null : shardStats.getSearch().getTotal().getQueryCount());
+            table.addCell(shardStats == null ? null : shardStats.getSearch().getTotal().getScrollCurrent());
+            table.addCell(shardStats == null ? null : shardStats.getSearch().getTotal().getScrollTime());
+            table.addCell(shardStats == null ? null : shardStats.getSearch().getTotal().getScrollCount());

            table.addCell(shardStats == null ? null : shardStats.getSegments().getCount());
            table.addCell(shardStats == null ? null : shardStats.getSegments().getMemory());
--- a/core/src/main/java/org/elasticsearch/search/internal/DefaultSearchContext.java
+++ b/core/src/main/java/org/elasticsearch/search/internal/DefaultSearchContext.java
@ -122,6 +122,7 @@ public class DefaultSearchContext extends SearchContext {
    private boolean queryRewritten;
    private volatile long keepAlive;
    private ScoreDoc lastEmittedDoc;
+    private final long originNanoTime = System.nanoTime();
    private volatile long lastAccessTime = -1;
    private InnerHitsContext innerHitsContext;

@ -269,6 +270,11 @@ public class DefaultSearchContext extends SearchContext {
        return this;
    }

+    @Override
+    public long getOriginNanoTime() {
+        return originNanoTime;
+    }
+
    @Override
    protected long nowInMillisImpl() {
        return request.nowInMillis();
--- a/core/src/main/java/org/elasticsearch/search/internal/FilteredSearchContext.java
+++ b/core/src/main/java/org/elasticsearch/search/internal/FilteredSearchContext.java
@ -139,6 +139,11 @@ public abstract class FilteredSearchContext extends SearchContext {
        return in.queryBoost(queryBoost);
    }

+    @Override
+    public long getOriginNanoTime() {
+        return in.getOriginNanoTime();
+    }
+
    @Override
    protected long nowInMillisImpl() {
        return in.nowInMillisImpl();
--- a/core/src/main/java/org/elasticsearch/search/internal/SearchContext.java
+++ b/core/src/main/java/org/elasticsearch/search/internal/SearchContext.java
@ -142,6 +142,8 @@ public abstract class SearchContext implements Releasable, HasContextAndHeaders

    public abstract SearchContext queryBoost(float queryBoost);

+    public abstract long getOriginNanoTime();
+
    public final long nowInMillis() {
        nowInMillisUsed = true;
        return nowInMillisImpl();
--- a/core/src/test/java/org/elasticsearch/ExceptionSerializationTests.java
+++ b/core/src/test/java/org/elasticsearch/ExceptionSerializationTests.java
@ -192,17 +192,18 @@ public class ExceptionSerializationTests extends ElasticsearchTestCase {
    }

    public void testIllegalShardRoutingStateException() throws IOException {
-        ShardRouting routing = TestShardRouting.newShardRouting("test", 0, "xyz", "def", false, ShardRoutingState.STARTED, 0);
+        final ShardRouting routing = TestShardRouting.newShardRouting("test", 0, "xyz", "def", false, ShardRoutingState.STARTED, 0);
+        final String routingAsString = routing.toString();
        IllegalShardRoutingStateException serialize = serialize(new IllegalShardRoutingStateException(routing, "foo", new NullPointerException()));
        assertNotNull(serialize.shard());
        assertEquals(routing, serialize.shard());
-        assertEquals("[test][0], node[xyz], relocating [def], [R], s[STARTED]: foo", serialize.getMessage());
+        assertEquals(routingAsString + ": foo", serialize.getMessage());
        assertTrue(serialize.getCause() instanceof NullPointerException);

        serialize = serialize(new IllegalShardRoutingStateException(routing, "bar", null));
        assertNotNull(serialize.shard());
        assertEquals(routing, serialize.shard());
-        assertEquals("[test][0], node[xyz], relocating [def], [R], s[STARTED]: bar", serialize.getMessage());
+        assertEquals(routingAsString + ": bar", serialize.getMessage());
        assertNull(serialize.getCause());
    }

--- a/core/src/test/java/org/elasticsearch/action/admin/indices/segments/IndicesShardStoreRequestTests.java
+++ b/core/src/test/java/org/elasticsearch/action/admin/indices/segments/IndicesShardStoreRequestTests.java
@ -17,7 +17,7 @@
 * under the License.
 */

-package org.elasticsearch.action.admin.indices.segments;
+package org.elasticsearch.action.admin.indices.shards;

 import com.carrotsearch.hppc.cursors.IntObjectCursor;

@ -37,10 +37,10 @@ import org.elasticsearch.index.IndexService;
 import org.elasticsearch.index.shard.IndexShard;
 import org.elasticsearch.indices.IndicesService;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
+import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.store.MockFSDirectoryService;
 import org.junit.Test;

-import java.io.IOException;
 import java.util.*;
 import java.util.concurrent.ExecutionException;

@ -58,6 +58,7 @@ public class IndicesShardStoreRequestTests extends ElasticsearchIntegrationTest
    }

    @Test
+    @TestLogging("action.admin.indices.shards:TRACE,cluster.service:TRACE")
    public void testBasic() throws Exception {
        String index = "test";
        internalCluster().ensureAtLeastNumDataNodes(2);
@ -91,10 +92,13 @@ public class IndicesShardStoreRequestTests extends ElasticsearchIntegrationTest
        disableAllocation(index);
        logger.info("--> stop random node");
        internalCluster().stopRandomNode(new IndexNodePredicate(index));
+        ensureYellow(index);
+        ClusterState clusterState = client().admin().cluster().prepareState().get().getState();
+        List<ShardRouting> unassignedShards = clusterState.routingTable().index(index).shardsWithState(ShardRoutingState.UNASSIGNED);
        response = client().admin().indices().shardStores(Requests.indicesShardStoresRequest(index)).get();
        assertThat(response.getStoreStatuses().containsKey(index), equalTo(true));
        ImmutableOpenIntMap<List<IndicesShardStoresResponse.StoreStatus>> shardStoresStatuses = response.getStoreStatuses().get(index);
-        assertThat(shardStoresStatuses.size(), greaterThan(0));
+        assertThat(shardStoresStatuses.size(), equalTo(unassignedShards.size()));
        for (IntObjectCursor<List<IndicesShardStoresResponse.StoreStatus>> storesStatus : shardStoresStatuses) {
            assertThat("must report for one store", storesStatus.value.size(), equalTo(1));
            assertThat("reported store should be primary", storesStatus.value.get(0).getAllocation(), equalTo(IndicesShardStoresResponse.StoreStatus.Allocation.PRIMARY));
--- a/core/src/test/java/org/elasticsearch/action/admin/indices/segments/IndicesShardStoreResponseTest.java
+++ b/core/src/test/java/org/elasticsearch/action/admin/indices/segments/IndicesShardStoreResponseTest.java
@ -17,7 +17,7 @@
 * under the License.
 */

-package org.elasticsearch.action.admin.indices.segments;
+package org.elasticsearch.action.admin.indices.shards;

 import com.google.common.collect.ImmutableList;
 import org.apache.lucene.index.CorruptIndexException;
--- a/core/src/test/java/org/elasticsearch/client/transport/TransportClientRetryTests.java
+++ b/core/src/test/java/org/elasticsearch/client/transport/TransportClientRetryTests.java
@ -32,6 +32,7 @@ import org.elasticsearch.node.internal.InternalSettingsPreparer;
 import org.elasticsearch.plugins.PluginsService;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.elasticsearch.test.InternalTestCluster;
+import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.transport.TransportService;
 import org.junit.Test;

@ -45,6 +46,7 @@ import static org.hamcrest.CoreMatchers.equalTo;
 import static org.hamcrest.Matchers.greaterThanOrEqualTo;

@ClusterScope(scope = Scope.TEST, numClientNodes = 0)
+@TestLogging("discovery.zen:TRACE")
 public class TransportClientRetryTests extends ElasticsearchIntegrationTest {

    @Test
--- a/core/src/test/java/org/elasticsearch/cluster/action/shard/ShardStateActionTest.java
+++ b/core/src/test/java/org/elasticsearch/cluster/action/shard/ShardStateActionTest.java
@ -1,90 +0,0 @@
-/*
- * Licensed to Elasticsearch under one or more contributor
- * license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright
- * ownership. Elasticsearch licenses this file to you under
- * the Apache License, Version 2.0 (the "License"); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-package org.elasticsearch.cluster.action.shard;
-
-import org.elasticsearch.Version;
-import org.elasticsearch.cluster.ClusterName;
-import org.elasticsearch.cluster.ClusterState;
-import org.elasticsearch.cluster.metadata.IndexMetaData;
-import org.elasticsearch.cluster.metadata.MetaData;
-import org.elasticsearch.cluster.node.DiscoveryNode;
-import org.elasticsearch.cluster.node.DiscoveryNodes;
-import org.elasticsearch.cluster.routing.*;
-import org.elasticsearch.common.settings.Settings;
-import org.elasticsearch.common.transport.DummyTransportAddress;
-import org.elasticsearch.test.ElasticsearchTestCase;
-
-import java.util.ArrayList;
-import java.util.List;
-
-import static org.hamcrest.Matchers.equalTo;
-
-
-public class ShardStateActionTest extends ElasticsearchTestCase {
-
-    public void testShardFiltering() {
-        final IndexMetaData indexMetaData = IndexMetaData.builder("test")
-                .settings(Settings.builder()
-                        .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
-                        .put(IndexMetaData.SETTING_INDEX_UUID, "test_uuid"))
-                .numberOfShards(2).numberOfReplicas(0)
-                .build();
-        ClusterState.Builder stateBuilder = ClusterState.builder(ClusterName.DEFAULT)
-                .nodes(DiscoveryNodes.builder()
-                                .put(new DiscoveryNode("node1", DummyTransportAddress.INSTANCE, Version.CURRENT)).masterNodeId("node1")
-                                .put(new DiscoveryNode("node2", DummyTransportAddress.INSTANCE, Version.CURRENT))
-                )
-                .metaData(MetaData.builder().put(indexMetaData, false));
-
-        final ShardRouting initShard = TestShardRouting.newShardRouting("test", 0, "node1", randomBoolean(), ShardRoutingState.INITIALIZING, 1);
-        final ShardRouting startedShard = TestShardRouting.newShardRouting("test", 1, "node2", randomBoolean(), ShardRoutingState.STARTED, 1);
-        final ShardRouting relocatingShard = TestShardRouting.newShardRouting("test", 2, "node1", "node2", randomBoolean(), ShardRoutingState.RELOCATING, 1);
-        stateBuilder.routingTable(RoutingTable.builder().add(IndexRoutingTable.builder("test")
-                .addIndexShard(new IndexShardRoutingTable.Builder(initShard.shardId(), true).addShard(initShard).build())
-                .addIndexShard(new IndexShardRoutingTable.Builder(startedShard.shardId(), true).addShard(startedShard).build())
-                .addIndexShard(new IndexShardRoutingTable.Builder(relocatingShard.shardId(), true).addShard(relocatingShard).build())));
-
-        ClusterState state = stateBuilder.build();
-
-        ArrayList<ShardStateAction.ShardRoutingEntry> listToFilter = new ArrayList<>();
-        ArrayList<ShardStateAction.ShardRoutingEntry> expectedToBeApplied = new ArrayList<>();
-
-        listToFilter.add(new ShardStateAction.ShardRoutingEntry(initShard, indexMetaData.indexUUID() + "_suffix", "wrong_uuid", null));
-
-        listToFilter.add(new ShardStateAction.ShardRoutingEntry(relocatingShard.buildTargetRelocatingShard(), indexMetaData.indexUUID(), "relocating_to_node", null));
-        expectedToBeApplied.add(listToFilter.get(listToFilter.size() - 1));
-
-        listToFilter.add(new ShardStateAction.ShardRoutingEntry(startedShard, indexMetaData.indexUUID(), "started shard", null));
-        expectedToBeApplied.add(listToFilter.get(listToFilter.size() - 1));
-
-        listToFilter.add(new ShardStateAction.ShardRoutingEntry(TestShardRouting.newShardRouting(initShard.index() + "_NA", initShard.id(),
-                initShard.currentNodeId(), initShard.primary(), initShard.state(), initShard.version()), indexMetaData.indexUUID(), "wrong_uuid", null));
-
-        List<ShardStateAction.ShardRoutingEntry> toBeApplied = ShardStateAction.extractShardsToBeApplied(listToFilter, "for testing", state.metaData(), logger);
-        if (toBeApplied.size() != expectedToBeApplied.size()) {
-            fail("size mismatch.\n Got: \n [" + toBeApplied + "], \n expected: \n [" + expectedToBeApplied + "]");
-        }
-        for (int i = 0; i < toBeApplied.size(); i++) {
-            final ShardStateAction.ShardRoutingEntry found = toBeApplied.get(i);
-            final ShardStateAction.ShardRoutingEntry expected = expectedToBeApplied.get(i);
-            assertThat(found, equalTo(expected));
-        }
-    }
-}
--- a/core/src/test/java/org/elasticsearch/cluster/routing/AllocationIdTests.java
+++ b/core/src/test/java/org/elasticsearch/cluster/routing/AllocationIdTests.java
@ -62,16 +62,15 @@ public class AllocationIdTests extends ElasticsearchTestCase {
        assertThat(shard.allocationId(), not(equalTo(allocationId)));
        assertThat(shard.allocationId().getId(), equalTo(allocationId.getId()));
        assertThat(shard.allocationId().getRelocationId(), notNullValue());
-        allocationId = shard.allocationId();

        ShardRouting target = shard.buildTargetRelocatingShard();
        assertThat(target.allocationId().getId(), equalTo(shard.allocationId().getRelocationId()));
-        assertThat(target.allocationId().getRelocationId(), nullValue());
+        assertThat(target.allocationId().getRelocationId(), equalTo(shard.allocationId().getId()));

        logger.info("-- finalize the relocation");
-        shard.moveToStarted();
-        assertThat(shard.allocationId().getId(), equalTo(target.allocationId().getId()));
-        assertThat(shard.allocationId().getRelocationId(), nullValue());
+        target.moveToStarted();
+        assertThat(target.allocationId().getId(), equalTo(shard.allocationId().getRelocationId()));
+        assertThat(target.allocationId().getRelocationId(), nullValue());
    }

    @Test
--- a/core/src/test/java/org/elasticsearch/cluster/routing/TestShardRouting.java
+++ b/core/src/test/java/org/elasticsearch/cluster/routing/TestShardRouting.java
@ -33,6 +33,10 @@ public class TestShardRouting {
        return new ShardRouting(index, shardId, currentNodeId, relocatingNodeId, null, primary, state, version, null, buildAllocationId(state), true);
    }

+    public static ShardRouting newShardRouting(String index, int shardId, String currentNodeId, String relocatingNodeId, boolean primary, ShardRoutingState state, AllocationId allocationId, long version) {
+        return new ShardRouting(index, shardId, currentNodeId, relocatingNodeId, null, primary, state, version, null, allocationId, true);
+    }
+
    public static ShardRouting newShardRouting(String index, int shardId, String currentNodeId, String relocatingNodeId, RestoreSource restoreSource, boolean primary, ShardRoutingState state, long version) {
        return new ShardRouting(index, shardId, currentNodeId, relocatingNodeId, restoreSource, primary, state, version, null, buildAllocationId(state), true);
    }
--- a/core/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedShardsRoutingTests.java
+++ b/core/src/test/java/org/elasticsearch/cluster/routing/allocation/FailedShardsRoutingTests.java
@ -19,6 +19,7 @@

 package org.elasticsearch.cluster.routing.allocation;

+import com.carrotsearch.randomizedtesting.annotations.Repeat;
 import com.google.common.collect.ImmutableList;
 import org.elasticsearch.Version;
 import org.elasticsearch.cluster.ClusterState;
@ -256,9 +257,9 @@ public class FailedShardsRoutingTests extends ElasticsearchAllocationTestCase {

        logger.info("fail the first shard, will have no place to be rerouted to (single node), so stays unassigned");
        prevRoutingTable = routingTable;
-        routingTable = strategy.applyFailedShard(clusterState, TestShardRouting.newShardRouting("test", 0, "node1", true, INITIALIZING, 0)).routingTable();
+        ShardRouting firstShard = clusterState.routingNodes().node("node1").get(0);
+        routingTable = strategy.applyFailedShard(clusterState, firstShard).routingTable();
        clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
-        RoutingNodes routingNodes = clusterState.routingNodes();

        assertThat(prevRoutingTable != routingTable, equalTo(true));
        assertThat(routingTable.index("test").shards().size(), equalTo(1));
@ -272,7 +273,7 @@ public class FailedShardsRoutingTests extends ElasticsearchAllocationTestCase {
        }

        logger.info("fail the shard again, see that nothing happens");
-        assertThat(strategy.applyFailedShard(clusterState, TestShardRouting.newShardRouting("test", 0, "node1", true, INITIALIZING, 0)).changed(), equalTo(false));
+        assertThat(strategy.applyFailedShard(clusterState, firstShard).changed(), equalTo(false));
    }

    @Test
@ -371,11 +372,14 @@ public class FailedShardsRoutingTests extends ElasticsearchAllocationTestCase {

        logger.info("fail the first shard, will start INITIALIZING on the second node");
        prevRoutingTable = routingTable;
-        routingTable = strategy.applyFailedShard(clusterState, TestShardRouting.newShardRouting("test", 0, nodeHoldingPrimary, true, INITIALIZING, 0)).routingTable();
+        final ShardRouting firstShard = clusterState.routingNodes().node(nodeHoldingPrimary).get(0);
+        routingTable = strategy.applyFailedShard(clusterState, firstShard).routingTable();
        clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build();
-        RoutingNodes routingNodes = clusterState.routingNodes();
-
        assertThat(prevRoutingTable != routingTable, equalTo(true));
+
+        final String nodeHoldingPrimary2 = routingTable.index("test").shard(0).primaryShard().currentNodeId();
+        assertThat(nodeHoldingPrimary2, not(equalTo(nodeHoldingPrimary)));
+
        assertThat(routingTable.index("test").shards().size(), equalTo(1));
        for (int i = 0; i < routingTable.index("test").shards().size(); i++) {
            assertThat(routingTable.index("test").shard(i).size(), equalTo(2));
@ -387,7 +391,7 @@ public class FailedShardsRoutingTests extends ElasticsearchAllocationTestCase {
        }

        logger.info("fail the shard again, see that nothing happens");
-        assertThat(strategy.applyFailedShard(clusterState, TestShardRouting.newShardRouting("test", 0, nodeHoldingPrimary, true, INITIALIZING, 0)).changed(), equalTo(false));
+        assertThat(strategy.applyFailedShard(clusterState, firstShard).changed(), equalTo(false));
    }

    @Test
--- a/core/src/test/java/org/elasticsearch/cluster/routing/allocation/StartedShardsRoutingTests.java
+++ b/core/src/test/java/org/elasticsearch/cluster/routing/allocation/StartedShardsRoutingTests.java
@ -64,68 +64,57 @@ public class StartedShardsRoutingTests extends ElasticsearchAllocationTestCase {

        RoutingAllocation.Result result = allocation.applyStartedShards(state, Arrays.asList(
                TestShardRouting.newShardRouting(initShard.index(), initShard.id(), initShard.currentNodeId(), initShard.relocatingNodeId(), initShard.primary(),
-                        ShardRoutingState.INITIALIZING, randomInt())), false);
+                        ShardRoutingState.INITIALIZING, initShard.allocationId(), randomInt())), false);
        assertTrue("failed to start " + initShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());
        assertTrue(initShard + "isn't started \ncurrent routing table:" + result.routingTable().prettyPrint(),
                result.routingTable().index("test").shard(initShard.id()).allShardsStarted());


-        logger.info("--> testing shard variants that shouldn't match the started shard");
+        logger.info("--> testing shard variants that shouldn't match the initializing shard");

        result = allocation.applyStartedShards(state, Arrays.asList(
-                TestShardRouting.newShardRouting(initShard.index(), initShard.id(), initShard.currentNodeId(), initShard.relocatingNodeId(), !initShard.primary(),
+                TestShardRouting.newShardRouting(initShard.index(), initShard.id(), initShard.currentNodeId(), initShard.relocatingNodeId(), initShard.primary(),
                        ShardRoutingState.INITIALIZING, 1)), false);
-        assertFalse("wrong primary flag shouldn't start shard " + initShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());
+        assertFalse("wrong allocation id flag shouldn't start shard " + initShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());

        result = allocation.applyStartedShards(state, Arrays.asList(
                TestShardRouting.newShardRouting(initShard.index(), initShard.id(), "some_node", initShard.currentNodeId(), initShard.primary(),
-                        ShardRoutingState.INITIALIZING, 1)), false);
+                        ShardRoutingState.INITIALIZING, AllocationId.newTargetRelocation(AllocationId.newRelocation(initShard.allocationId()))
+                        , 1)), false);
        assertFalse("relocating shard from node shouldn't start shard " + initShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());

-        result = allocation.applyStartedShards(state, Arrays.asList(
-                TestShardRouting.newShardRouting(initShard.index(), initShard.id(), initShard.currentNodeId(), "some_node", initShard.primary(),
-                        ShardRoutingState.INITIALIZING, 1)), false);
-        assertFalse("relocating shard to node shouldn't start shard " + initShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());


        logger.info("--> testing double starting");

        result = allocation.applyStartedShards(state, Arrays.asList(
                TestShardRouting.newShardRouting(startedShard.index(), startedShard.id(), startedShard.currentNodeId(), startedShard.relocatingNodeId(), startedShard.primary(),
-                        ShardRoutingState.INITIALIZING, 1)), false);
+                        ShardRoutingState.INITIALIZING, startedShard.allocationId(), 1)), false);
        assertFalse("duplicate starting of the same shard should be ignored \ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());

        logger.info("--> testing starting of relocating shards");
+        final AllocationId targetAllocationId = AllocationId.newTargetRelocation(relocatingShard.allocationId());
        result = allocation.applyStartedShards(state, Arrays.asList(
                TestShardRouting.newShardRouting(relocatingShard.index(), relocatingShard.id(), relocatingShard.relocatingNodeId(), relocatingShard.currentNodeId(), relocatingShard.primary(),
-                        ShardRoutingState.INITIALIZING, randomInt())), false);
+                        ShardRoutingState.INITIALIZING, targetAllocationId, randomInt())), false);
+
        assertTrue("failed to start " + relocatingShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());
        ShardRouting shardRouting = result.routingTable().index("test").shard(relocatingShard.id()).getShards().get(0);
        assertThat(shardRouting.state(), equalTo(ShardRoutingState.STARTED));
        assertThat(shardRouting.currentNodeId(), equalTo("node2"));
        assertThat(shardRouting.relocatingNodeId(), nullValue());

-        logger.info("--> testing shard variants that shouldn't match the relocating shard");
+        logger.info("--> testing shard variants that shouldn't match the initializing relocating shard");

        result = allocation.applyStartedShards(state, Arrays.asList(
-                TestShardRouting.newShardRouting(relocatingShard.index(), relocatingShard.id(), relocatingShard.relocatingNodeId(), relocatingShard.currentNodeId(), !relocatingShard.primary(),
-                        ShardRoutingState.INITIALIZING, 1)), false);
-        assertFalse("wrong primary flag shouldn't start shard " + relocatingShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());
+                TestShardRouting.newShardRouting(relocatingShard.index(), relocatingShard.id(), relocatingShard.relocatingNodeId(), relocatingShard.currentNodeId(), relocatingShard.primary(),
+                        ShardRoutingState.INITIALIZING, relocatingShard.version())));
+        assertFalse("wrong allocation id shouldn't start shard" + relocatingShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());

        result = allocation.applyStartedShards(state, Arrays.asList(
-                TestShardRouting.newShardRouting(relocatingShard.index(), relocatingShard.id(), "some_node", relocatingShard.currentNodeId(), relocatingShard.primary(),
-                        ShardRoutingState.INITIALIZING, 1)), false);
-        assertFalse("relocating shard to a different node shouldn't start shard " + relocatingShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());
-
-        result = allocation.applyStartedShards(state, Arrays.asList(
-                TestShardRouting.newShardRouting(relocatingShard.index(), relocatingShard.id(), relocatingShard.relocatingNodeId(), "some_node", relocatingShard.primary(),
-                        ShardRoutingState.INITIALIZING, 1)), false);
-        assertFalse("relocating shard from a different node shouldn't start shard " + relocatingShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());
-
-        result = allocation.applyStartedShards(state, Arrays.asList(
-                TestShardRouting.newShardRouting(relocatingShard.index(), relocatingShard.id(), relocatingShard.relocatingNodeId(), relocatingShard.primary(),
-                        ShardRoutingState.INITIALIZING, 1)), false);
-        assertFalse("non-relocating shard shouldn't start shard" + relocatingShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());
+                TestShardRouting.newShardRouting(relocatingShard.index(), relocatingShard.id(), relocatingShard.relocatingNodeId(), relocatingShard.currentNodeId(), relocatingShard.primary(),
+                        ShardRoutingState.INITIALIZING, relocatingShard.allocationId(), randomInt())), false);
+        assertFalse("wrong allocation id shouldn't start shard even if relocatingId==shard.id" + relocatingShard + "\ncurrent routing table:" + result.routingTable().prettyPrint(), result.changed());

    }
 }
--- a/core/src/test/java/org/elasticsearch/common/util/concurrent/PrioritizedExecutorsTests.java
+++ b/core/src/test/java/org/elasticsearch/common/util/concurrent/PrioritizedExecutorsTests.java
@ -244,7 +244,7 @@ public class PrioritizedExecutorsTests extends ElasticsearchTestCase {
    @Test
    public void testTimeoutCleanup() throws Exception {
        ThreadPool threadPool = new ThreadPool("test");
-        ScheduledThreadPoolExecutor timer = (ScheduledThreadPoolExecutor) threadPool.scheduler();
+        final ScheduledThreadPoolExecutor timer = (ScheduledThreadPoolExecutor) threadPool.scheduler();
        final AtomicBoolean timeoutCalled = new AtomicBoolean();
        PrioritizedEsThreadPoolExecutor executor = EsExecutors.newSinglePrioritizing(EsExecutors.daemonThreadFactory(getTestName()));
        final CountDownLatch invoked = new CountDownLatch(1);
@ -253,7 +253,7 @@ public class PrioritizedExecutorsTests extends ElasticsearchTestCase {
                             public void run() {
                                 invoked.countDown();
                             }
-                         }, timer, TimeValue.timeValueMillis(1000), new Runnable() {
+                         }, timer, TimeValue.timeValueHours(1), new Runnable() {
                    @Override
                    public void run() {
                        // We should never get here
@ -262,7 +262,15 @@ public class PrioritizedExecutorsTests extends ElasticsearchTestCase {
                }
        );
        invoked.await();
-        assertThat(timer.getQueue().size(), equalTo(0));
+
+        // the timeout handler is added post execution (and quickly cancelled). We have to allow for this
+        // and use assertBusy
+        assertBusy(new Runnable() {
+            @Override
+            public void run() {
+                assertThat(timer.getQueue().size(), equalTo(0));
+            }
+        }, 5, TimeUnit.SECONDS);
        assertThat(timeoutCalled.get(), equalTo(false));
        assertTrue(terminate(executor));
        assertTrue(terminate(threadPool));
--- a/core/src/test/java/org/elasticsearch/gateway/PrimaryShardAllocatorTests.java
+++ b/core/src/test/java/org/elasticsearch/gateway/PrimaryShardAllocatorTests.java
@ -0,0 +1,319 @@
+/*
+ * Licensed to Elasticsearch under one or more contributor
+ * license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright
+ * ownership. Elasticsearch licenses this file to you under
+ * the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.elasticsearch.gateway;
+
+import org.apache.lucene.index.CorruptIndexException;
+import org.elasticsearch.Version;
+import org.elasticsearch.cluster.ClusterState;
+import org.elasticsearch.cluster.metadata.IndexMetaData;
+import org.elasticsearch.cluster.metadata.MetaData;
+import org.elasticsearch.cluster.metadata.SnapshotId;
+import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.node.DiscoveryNodes;
+import org.elasticsearch.cluster.routing.*;
+import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
+import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
+import org.elasticsearch.common.Nullable;
+import org.elasticsearch.common.settings.Settings;
+import org.elasticsearch.index.shard.ShardId;
+import org.elasticsearch.test.ElasticsearchAllocationTestCase;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import static org.hamcrest.Matchers.anyOf;
+import static org.hamcrest.Matchers.equalTo;
+
+/**
+ */
+public class PrimaryShardAllocatorTests extends ElasticsearchAllocationTestCase {
+
+    private final ShardId shardId = new ShardId("test", 0);
+    private final DiscoveryNode node1 = newNode("node1");
+    private final DiscoveryNode node2 = newNode("node2");
+    private final DiscoveryNode node3 = newNode("node3");
+    private TestAllocator testAllocator;
+
+    @Before
+    public void buildTestAllocator() {
+        this.testAllocator = new TestAllocator();
+    }
+
+    /**
+     * Verifies that the needToFindPrimaryCopy method of the primary allocator behaves correctly
+     * and returns false for an unassigned replica shard.
+     */
+    @Test
+    public void testNoProcessReplica() {
+        ShardRouting shard = TestShardRouting.newShardRouting("test", 0, null, null, null, false, ShardRoutingState.UNASSIGNED, 0, new UnassignedInfo(UnassignedInfo.Reason.CLUSTER_RECOVERED, null));
+        assertThat(testAllocator.needToFindPrimaryCopy(shard, null), equalTo(false));
+    }
+
+    /**
+     * Tests that when async fetch returns that there is no data, the shard will not be allocated.
+     */
+    @Test
+    public void testNoAsyncFetchData() {
+        RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(yesAllocationDeciders());
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(false));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(1));
+        assertThat(allocation.routingNodes().unassigned().ignored().get(0).shardId(), equalTo(shardId));
+    }
+
+    /**
+     * Tests that when the node returns that no data was found for the shard (-1), the shard will be moved to ignore unassigned.
+     */
+    @Test
+    public void testNoAllocationFound() {
+        RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(yesAllocationDeciders());
+        testAllocator.addData(node1, -1);
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(false));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(1));
+        assertThat(allocation.routingNodes().unassigned().ignored().get(0).shardId(), equalTo(shardId));
+    }
+
+    /**
+     * Tests that when the node reports a store exception for the shard, the shard will be moved to ignore unassigned.
+     */
+    @Test
+    public void testStoreException() {
+        RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(yesAllocationDeciders());
+        testAllocator.addData(node1, 3, new CorruptIndexException("test", "test"));
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(false));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(1));
+        assertThat(allocation.routingNodes().unassigned().ignored().get(0).shardId(), equalTo(shardId));
+    }
+
+    /**
+     * Tests that when there is a node to allocate the shard to, it will be allocated to it.
+     */
+    @Test
+    public void testFoundAllocationAndAllocating() {
+        RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(yesAllocationDeciders());
+        testAllocator.addData(node1, 10);
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), equalTo(node1.id()));
+    }
+
+    /**
+     * Tests that when there is a node to allocate to, but it is throttling (and it is the only one),
+     * it will be moved to ignore unassigned until it can be allocated to.
+     */
+    @Test
+    public void testFoundAllocationButThrottlingDecider() {
+        RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(throttleAllocationDeciders());
+        testAllocator.addData(node1, 10);
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(false));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(1));
+        assertThat(allocation.routingNodes().unassigned().ignored().get(0).shardId(), equalTo(shardId));
+    }
+
+    /**
+     * Tests that when there is a node to be allocated to, but the decider said "no", we still
+     * force the allocation to it.
+     */
+    @Test
+    public void testFoundAllocationButNoDecider() {
+        RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(noAllocationDeciders());
+        testAllocator.addData(node1, 10);
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), equalTo(node1.id()));
+    }
+
+    /**
+     * Tests that the highest version node is chosen for allocation.
+     */
+    @Test
+    public void testAllocateToTheHighestVersion() {
+        RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(yesAllocationDeciders());
+        testAllocator.addData(node1, 10).addData(node2, 12);
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), equalTo(node2.id()));
+    }
+
+    /**
+     * Tests that when restoring from snapshot, even if we didn't find any node to allocate on, the shard
+     * will remain in the unassigned list to be allocated later.
+     */
+    @Test
+    public void testRestoreIgnoresNoNodesToAllocate() {
+        MetaData metaData = MetaData.builder()
+                .put(IndexMetaData.builder(shardId.getIndex()).settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(0))
+                .build();
+        RoutingTable routingTable = RoutingTable.builder()
+                .addAsRestore(metaData.index(shardId.getIndex()), new RestoreSource(new SnapshotId("test", "test"), Version.CURRENT, shardId.getIndex()))
+                .build();
+        ClusterState state = ClusterState.builder(org.elasticsearch.cluster.ClusterName.DEFAULT)
+                .metaData(metaData)
+                .routingTable(routingTable)
+                .nodes(DiscoveryNodes.builder().put(node1).put(node2).put(node3)).build();
+        RoutingAllocation allocation = new RoutingAllocation(yesAllocationDeciders(), state.routingNodes(), state.nodes(), null);
+
+        testAllocator.addData(node1, -1).addData(node2, -1);
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(false));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+    }
+
+    /**
+     * Tests that only when enough copies of the shard exist we are going to allocate it. This test
+     * verifies that with the same version (1) and quorum allocation.
+     */
+    @Test
+    public void testEnoughCopiesFoundForAllocation() {
+        MetaData metaData = MetaData.builder()
+                .put(IndexMetaData.builder(shardId.getIndex()).settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(2))
+                .build();
+        RoutingTable routingTable = RoutingTable.builder()
+                .addAsRecovery(metaData.index(shardId.getIndex()))
+                .build();
+        ClusterState state = ClusterState.builder(org.elasticsearch.cluster.ClusterName.DEFAULT)
+                .metaData(metaData)
+                .routingTable(routingTable)
+                .nodes(DiscoveryNodes.builder().put(node1).put(node2).put(node3)).build();
+
+        RoutingAllocation allocation = new RoutingAllocation(yesAllocationDeciders(), state.routingNodes(), state.nodes(), null);
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(false));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(1));
+        assertThat(allocation.routingNodes().unassigned().ignored().get(0).shardId(), equalTo(shardId));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED).size(), equalTo(2)); // replicas
+
+        testAllocator.addData(node1, 1);
+        allocation = new RoutingAllocation(yesAllocationDeciders(), state.routingNodes(), state.nodes(), null);
+        changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(false));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(1));
+        assertThat(allocation.routingNodes().unassigned().ignored().get(0).shardId(), equalTo(shardId));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED).size(), equalTo(2)); // replicas
+
+        testAllocator.addData(node2, 1);
+        allocation = new RoutingAllocation(yesAllocationDeciders(), state.routingNodes(), state.nodes(), null);
+        changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(0));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED).size(), equalTo(2)); // replicas
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), anyOf(equalTo(node2.id()), equalTo(node1.id())));
+    }
+
+    /**
+     * Tests that only when enough copies of the shard exist we are going to allocate it. This test
+     * verifies that even with different versions, we treat each version as a copy and count it.
+     */
+    @Test
+    public void testEnoughCopiesFoundForAllocationWithDifferentVersion() {
+        MetaData metaData = MetaData.builder()
+                .put(IndexMetaData.builder(shardId.getIndex()).settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(2))
+                .build();
+        RoutingTable routingTable = RoutingTable.builder()
+                .addAsRecovery(metaData.index(shardId.getIndex()))
+                .build();
+        ClusterState state = ClusterState.builder(org.elasticsearch.cluster.ClusterName.DEFAULT)
+                .metaData(metaData)
+                .routingTable(routingTable)
+                .nodes(DiscoveryNodes.builder().put(node1).put(node2).put(node3)).build();
+
+        RoutingAllocation allocation = new RoutingAllocation(yesAllocationDeciders(), state.routingNodes(), state.nodes(), null);
+        boolean changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(false));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(1));
+        assertThat(allocation.routingNodes().unassigned().ignored().get(0).shardId(), equalTo(shardId));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED).size(), equalTo(2)); // replicas
+
+        testAllocator.addData(node1, 1);
+        allocation = new RoutingAllocation(yesAllocationDeciders(), state.routingNodes(), state.nodes(), null);
+        changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(false));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(1));
+        assertThat(allocation.routingNodes().unassigned().ignored().get(0).shardId(), equalTo(shardId));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED).size(), equalTo(2)); // replicas
+
+        testAllocator.addData(node2, 2);
+        allocation = new RoutingAllocation(yesAllocationDeciders(), state.routingNodes(), state.nodes(), null);
+        changed = testAllocator.allocateUnassigned(allocation);
+        assertThat(changed, equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().size(), equalTo(0));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.UNASSIGNED).size(), equalTo(2)); // replicas
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(), equalTo(node2.id()));
+    }
+
+    private RoutingAllocation routingAllocationWithOnePrimaryNoReplicas(AllocationDeciders deciders) {
+        MetaData metaData = MetaData.builder()
+                .put(IndexMetaData.builder(shardId.getIndex()).settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(0))
+                .build();
+        RoutingTable routingTable = RoutingTable.builder()
+                .addAsRecovery(metaData.index(shardId.getIndex()))
+                .build();
+        ClusterState state = ClusterState.builder(org.elasticsearch.cluster.ClusterName.DEFAULT)
+                .metaData(metaData)
+                .routingTable(routingTable)
+                .nodes(DiscoveryNodes.builder().put(node1).put(node2).put(node3)).build();
+        return new RoutingAllocation(deciders, state.routingNodes(), state.nodes(), null);
+    }
+
+    class TestAllocator extends PrimaryShardAllocator {
+
+        private Map<DiscoveryNode, TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> data;
+
+        public TestAllocator() {
+            super(Settings.EMPTY);
+        }
+
+        public TestAllocator clear() {
+            data = null;
+            return this;
+        }
+
+        public TestAllocator addData(DiscoveryNode node, long version) {
+            return addData(node, version, null);
+        }
+
+        public TestAllocator addData(DiscoveryNode node, long version, @Nullable Throwable storeException) {
+            if (data == null) {
+                data = new HashMap<>();
+            }
+            data.put(node, new TransportNodesListGatewayStartedShards.NodeGatewayStartedShards(node, version, storeException));
+            return this;
+        }
+
+        @Override
+        protected AsyncShardFetch.FetchResult<TransportNodesListGatewayStartedShards.NodeGatewayStartedShards> fetchData(ShardRouting shard, RoutingAllocation allocation) {
+            return new AsyncShardFetch.FetchResult<>(shardId, data, Collections.<String>emptySet(), Collections.<String>emptySet());
+        }
+    }
+}
--- a/core/src/test/java/org/elasticsearch/gateway/PriorityComparatorTests.java
+++ b/core/src/test/java/org/elasticsearch/gateway/PriorityComparatorTests.java
@ -30,7 +30,7 @@ import java.util.Map;
 public class PriorityComparatorTests extends ElasticsearchTestCase {

    public void testPriorityComparatorSort() {
-        RoutingNodes.UnassignedShards shards = new RoutingNodes.UnassignedShards();
+        RoutingNodes.UnassignedShards shards = new RoutingNodes.UnassignedShards((RoutingNodes) null);
        int numIndices = randomIntBetween(3, 99);
        IndexMeta[] indices = new IndexMeta[numIndices];
        final Map<String, IndexMeta> map = new HashMap<>();
--- a/core/src/test/java/org/elasticsearch/index/store/CorruptedFileTest.java
+++ b/core/src/test/java/org/elasticsearch/index/store/CorruptedFileTest.java
@ -51,7 +51,7 @@ import org.elasticsearch.common.lucene.Lucene;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.ByteSizeUnit;
 import org.elasticsearch.discovery.Discovery;
-import org.elasticsearch.gateway.GatewayAllocator;
+import org.elasticsearch.gateway.PrimaryShardAllocator;
 import org.elasticsearch.index.settings.IndexSettings;
 import org.elasticsearch.index.shard.*;
 import org.elasticsearch.index.translog.TranslogService;
@ -514,7 +514,7 @@ public class CorruptedFileTest extends ElasticsearchIntegrationTest {
        internalCluster().ensureAtLeastNumDataNodes(2);

        assertAcked(prepareCreate("test").setSettings(Settings.builder()
-                        .put(GatewayAllocator.INDEX_RECOVERY_INITIAL_SHARDS, "one")
+                        .put(PrimaryShardAllocator.INDEX_RECOVERY_INITIAL_SHARDS, "one")
                        .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, cluster().numDataNodes() - 1)
                        .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
                        .put(MockFSDirectoryService.CHECK_INDEX_ON_CLOSE, false) // no checkindex - we corrupt shards on purpose
--- a/core/src/test/java/org/elasticsearch/indices/mapping/SimpleGetFieldMappingsTests.java
+++ b/core/src/test/java/org/elasticsearch/indices/mapping/SimpleGetFieldMappingsTests.java
@ -26,7 +26,6 @@ import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.common.xcontent.XContentFactory;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.hamcrest.Matchers;
-import org.junit.Test;

 import java.io.IOException;
 import java.util.Arrays;
@ -40,7 +39,7 @@ import static org.hamcrest.Matchers.*;

 public class SimpleGetFieldMappingsTests extends ElasticsearchIntegrationTest {
    
-    public void getMappingsWhereThereAreNone() {
+    public void testGetMappingsWhereThereAreNone() {
        createIndex("index");
        ensureYellow();
        GetFieldMappingsResponse response = client().admin().indices().prepareGetFieldMappings().get();
@ -57,7 +56,7 @@ public class SimpleGetFieldMappingsTests extends ElasticsearchIntegrationTest {
                .endObject().endObject().endObject();
    }
    
-    public void simpleGetFieldMappings() throws Exception {
+    public void testSimpleGetFieldMappings() throws Exception {

        assertAcked(prepareCreate("indexa")
                .addMapping("typeA", getMappingForType("typeA"))
@ -128,7 +127,7 @@ public class SimpleGetFieldMappingsTests extends ElasticsearchIntegrationTest {
    }

    @SuppressWarnings("unchecked")
-    public void simpleGetFieldMappingsWithDefaults() throws Exception {
+    public void testSimpleGetFieldMappingsWithDefaults() throws Exception {
        assertAcked(prepareCreate("test").addMapping("type", getMappingForType("type")));

        client().prepareIndex("test", "type", "1").setSource("num", 1).get();
@ -147,7 +146,7 @@ public class SimpleGetFieldMappingsTests extends ElasticsearchIntegrationTest {
    }

    //fix #6552
-    public void simpleGetFieldMappingsWithPretty() throws Exception {
+    public void testSimpleGetFieldMappingsWithPretty() throws Exception {
        assertAcked(prepareCreate("index").addMapping("type", getMappingForType("type")));
        Map<String, String> params = Maps.newHashMap();
        params.put("pretty", "true");
@ -179,7 +178,6 @@ public class SimpleGetFieldMappingsTests extends ElasticsearchIntegrationTest {

    }

-    @Test
    public void testGetFieldMappingsWithBlocks() throws Exception {
        assertAcked(prepareCreate("test")
                .addMapping("typeA", getMappingForType("typeA"))
--- a/core/src/test/java/org/elasticsearch/indices/store/IndicesStoreIntegrationTests.java
+++ b/core/src/test/java/org/elasticsearch/indices/store/IndicesStoreIntegrationTests.java
@ -47,6 +47,7 @@ import java.io.IOException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;
+import java.util.List;
 import java.util.concurrent.Future;
 import java.util.concurrent.TimeUnit;

@ -204,19 +205,15 @@ public class IndicesStoreIntegrationTests extends ElasticsearchIntegrationTest {

    @Test @Slow
    public void testShardActiveElseWhere() throws Exception {
-        boolean node1IsMasterEligible = randomBoolean();
-        boolean node2IsMasterEligible = !node1IsMasterEligible || randomBoolean();
-        Future<String> node_1_future = internalCluster().startNodeAsync(Settings.builder().put("node.master", node1IsMasterEligible).build());
-        Future<String> node_2_future = internalCluster().startNodeAsync(Settings.builder().put("node.master", node2IsMasterEligible).build());
-        final String node_1 = node_1_future.get();
-        final String node_2 = node_2_future.get();
-        final String node_1_id = internalCluster().getInstance(DiscoveryService.class, node_1).localNode().getId();
-        final String node_2_id = internalCluster().getInstance(DiscoveryService.class, node_2).localNode().getId();
+        List<String> nodes = internalCluster().startNodesAsync(2).get();

-        logger.debug("node {} (node_1) is {}master eligible", node_1, node1IsMasterEligible ? "" : "not ");
-        logger.debug("node {} (node_2) is {}master eligible", node_2, node2IsMasterEligible ? "" : "not ");
-        logger.debug("node {} became master", internalCluster().getMasterName());
-        final int numShards = scaledRandomIntBetween(2, 20);
+        final String masterNode = internalCluster().getMasterName();
+        final String nonMasterNode = nodes.get(0).equals(masterNode) ? nodes.get(1) : nodes.get(0);
+
+        final String masterId = internalCluster().clusterService(masterNode).localNode().id();
+        final String nonMasterId = internalCluster().clusterService(nonMasterNode).localNode().id();
+
+        final int numShards = scaledRandomIntBetween(2, 10);
        assertAcked(prepareCreate("test")
                        .setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0).put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, numShards))
        );
@ -225,14 +222,14 @@ public class IndicesStoreIntegrationTests extends ElasticsearchIntegrationTest {
        waitNoPendingTasksOnAll();
        ClusterStateResponse stateResponse = client().admin().cluster().prepareState().get();

-        RoutingNode routingNode = stateResponse.getState().routingNodes().node(node_2_id);
+        RoutingNode routingNode = stateResponse.getState().routingNodes().node(nonMasterId);
        final int[] node2Shards = new int[routingNode.numberOfOwningShards()];
        int i = 0;
        for (ShardRouting shardRouting : routingNode) {
            node2Shards[i] = shardRouting.shardId().id();
            i++;
        }
-        logger.info("Node 2 has shards: {}", Arrays.toString(node2Shards));
+        logger.info("Node [{}] has shards: {}", nonMasterNode, Arrays.toString(node2Shards));
        final long shardVersions[] = new long[numShards];
        final int shardIds[] = new int[numShards];
        i = 0;
@ -241,17 +238,18 @@ public class IndicesStoreIntegrationTests extends ElasticsearchIntegrationTest {
            shardIds[i] = shardRouting.getId();
            i++;
        }
+
        // disable relocations when we do this, to make sure the shards are not relocated from node2
        // due to rebalancing, and delete its content
        client().admin().cluster().prepareUpdateSettings().setTransientSettings(settingsBuilder().put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE, EnableAllocationDecider.Rebalance.NONE)).get();
-        internalCluster().getInstance(ClusterService.class, node_2).submitStateUpdateTask("test", Priority.IMMEDIATE, new ClusterStateUpdateTask() {
+        internalCluster().getInstance(ClusterService.class, nonMasterNode).submitStateUpdateTask("test", Priority.IMMEDIATE, new ClusterStateUpdateTask() {
            @Override
            public ClusterState execute(ClusterState currentState) throws Exception {
                IndexRoutingTable.Builder indexRoutingTableBuilder = IndexRoutingTable.builder("test");
                for (int i = 0; i < numShards; i++) {
                    indexRoutingTableBuilder.addIndexShard(
                            new IndexShardRoutingTable.Builder(new ShardId("test", i), false)
-                                    .addShard(TestShardRouting.newShardRouting("test", i, node_1_id, true, ShardRoutingState.STARTED, shardVersions[shardIds[i]]))
+                                    .addShard(TestShardRouting.newShardRouting("test", i, masterId, true, ShardRoutingState.STARTED, shardVersions[shardIds[i]]))
                                    .build()
                    );
                }
@ -271,7 +269,7 @@ public class IndicesStoreIntegrationTests extends ElasticsearchIntegrationTest {
        waitNoPendingTasksOnAll();
        logger.info("Checking if shards aren't removed");
        for (int shard : node2Shards) {
-            assertTrue(waitForShardDeletion(node_2, "test", shard));
+            assertTrue(waitForShardDeletion(nonMasterNode, "test", shard));
        }
    }

--- a/core/src/test/java/org/elasticsearch/search/aggregations/pipeline/moving/avg/MovAvgTests.java
+++ b/core/src/test/java/org/elasticsearch/search/aggregations/pipeline/moving/avg/MovAvgTests.java
@ -50,7 +50,6 @@ import static org.elasticsearch.search.aggregations.AggregationBuilders.range;
 import static org.elasticsearch.search.aggregations.pipeline.PipelineAggregatorBuilders.derivative;
 import static org.elasticsearch.search.aggregations.pipeline.PipelineAggregatorBuilders.movingAvg;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertSearchResponse;
-import static org.hamcrest.Matchers.closeTo;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.core.IsNull.notNullValue;
 import static org.hamcrest.core.IsNull.nullValue;
@ -92,7 +91,7 @@ public class MovAvgTests extends ElasticsearchIntegrationTest {
    }

    enum MetricTarget {
-        VALUE ("value"), COUNT("count");
+        VALUE ("value"), COUNT("count"), METRIC("metric");

        private final String name;

@ -1326,8 +1325,8 @@ public class MovAvgTests extends ElasticsearchIntegrationTest {
            assertThat("[_count] movavg should be NaN, but is ["+countMovAvg.value()+"] instead", countMovAvg.value(), equalTo(Double.NaN));
        } else {
            assertThat("[_count] movavg is null", countMovAvg, notNullValue());
-            assertThat("[_count] movavg does not match expected ["+countMovAvg.value()+" vs "+expectedCount+"]",
-                    countMovAvg.value(), closeTo(expectedCount, 0.1));
+            assertTrue("[_count] movavg does not match expected [" + countMovAvg.value() + " vs " + expectedCount + "]",
+                    nearlyEqual(countMovAvg.value(), expectedCount, 0.1));
        }

        // This is a gap bucket
@ -1338,8 +1337,34 @@ public class MovAvgTests extends ElasticsearchIntegrationTest {
            assertThat("[value] movavg should be NaN, but is ["+valuesMovAvg.value()+"] instead", valuesMovAvg.value(), equalTo(Double.NaN));
        } else {
            assertThat("[value] movavg is null", valuesMovAvg, notNullValue());
-            assertThat("[value] movavg does not match expected ["+valuesMovAvg.value()+" vs "+expectedValue+"]",
-                    valuesMovAvg.value(), closeTo(expectedValue, 0.1));
+            assertTrue("[value] movavg does not match expected [" + valuesMovAvg.value() + " vs " + expectedValue + "]",
+                    nearlyEqual(valuesMovAvg.value(), expectedValue, 0.1));
+        }
+    }
+
+    /**
+     * Better floating point comparisons courtesy of https://github.com/brazzy/floating-point-gui.de
+     *
+     * Snippet adapted to use doubles instead of floats
+     *
+     * @param a first value to compare
+     * @param b second value to compare
+     * @param epsilon tolerance: upper bound on diff / (|a| + |b|) in the general case; scaled by Double.MIN_NORMAL when a value is zero or the difference is below Double.MIN_NORMAL
+     * @return true if a == b or the difference is below the tolerance; false otherwise (including when either value is NaN)
+     */
+    private static boolean nearlyEqual(double a, double b, double epsilon) {
+        final double absA = Math.abs(a);
+        final double absB = Math.abs(b);
+        final double diff = Math.abs(a - b);
+
+        if (a == b) { // shortcut, handles infinities
+            return true;
+        } else if (a == 0 || b == 0 || diff < Double.MIN_NORMAL) {
+            // a or b is zero or both are extremely close to it
+            // relative error is less meaningful here
+            return diff < (epsilon * Double.MIN_NORMAL);
+        } else { // use relative error
+            return diff / Math.min((absA + absB), Double.MAX_VALUE) < epsilon;
         }
     }

--- a/core/src/test/java/org/elasticsearch/search/stats/SearchStatsTests.java
+++ b/core/src/test/java/org/elasticsearch/search/stats/SearchStatsTests.java
@ -21,8 +21,6 @@ package org.elasticsearch.search.stats;

 import org.elasticsearch.action.admin.cluster.node.stats.NodeStats;
 import org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse;
-import org.elasticsearch.action.admin.cluster.stats.ClusterStatsRequest;
-import org.elasticsearch.action.admin.cluster.stats.ClusterStatsRequestBuilder;
 import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
 import org.elasticsearch.action.search.SearchResponse;
 import org.elasticsearch.action.search.SearchType;
@ -35,14 +33,11 @@ import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.index.query.QueryBuilders;
 import org.elasticsearch.index.search.stats.SearchStats.Stats;
 import org.elasticsearch.script.Script;
-import org.elasticsearch.search.SearchHit;
 import org.elasticsearch.test.ElasticsearchIntegrationTest;
 import org.junit.Test;

 import java.util.HashSet;
-import java.util.Iterator;
 import java.util.Set;
-import java.util.concurrent.TimeUnit;

 import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_REPLICAS;
 import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
--- a/core/src/test/java/org/elasticsearch/test/ElasticsearchAllocationTestCase.java
+++ b/core/src/test/java/org/elasticsearch/test/ElasticsearchAllocationTestCase.java
@ -23,13 +23,16 @@ import org.elasticsearch.Version;
 import org.elasticsearch.cluster.ClusterInfoService;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.node.DiscoveryNode;
+import org.elasticsearch.cluster.routing.RoutingNode;
 import org.elasticsearch.cluster.routing.ShardRouting;
 import org.elasticsearch.cluster.routing.RoutingTable;
 import org.elasticsearch.cluster.routing.allocation.AllocationService;
+import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
 import org.elasticsearch.cluster.routing.allocation.allocator.ShardsAllocators;
 import org.elasticsearch.cluster.routing.allocation.decider.AllocationDecider;
 import org.elasticsearch.cluster.routing.allocation.decider.AllocationDeciders;
 import org.elasticsearch.cluster.routing.allocation.decider.AllocationDecidersModule;
+import org.elasticsearch.cluster.routing.allocation.decider.Decision;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.transport.DummyTransportAddress;
 import org.elasticsearch.common.transport.TransportAddress;
@ -121,4 +124,41 @@ public abstract class ElasticsearchAllocationTestCase extends ElasticsearchTestC
        RoutingTable routingTable = strategy.applyStartedShards(clusterState, newArrayList(initializingShards.get(randomInt(initializingShards.size() - 1)))).routingTable();
        return ClusterState.builder(clusterState).routingTable(routingTable).build();
    }
+
+    public static AllocationDeciders yesAllocationDeciders() { // deciders whose only member always answers Decision.YES
+        return new AllocationDeciders(Settings.EMPTY, new AllocationDecider[] {new TestAllocateDecision(Decision.YES)});
+    }
+
+    public static AllocationDeciders noAllocationDeciders() { // deciders whose only member always answers Decision.NO
+        return new AllocationDeciders(Settings.EMPTY, new AllocationDecider[] {new TestAllocateDecision(Decision.NO)});
+    }
+
+    public static AllocationDeciders throttleAllocationDeciders() { // deciders whose only member always answers Decision.THROTTLE
+        return new AllocationDeciders(Settings.EMPTY, new AllocationDecider[] {new TestAllocateDecision(Decision.THROTTLE)});
+    }
+
+    static class TestAllocateDecision extends AllocationDecider {
+        // Fixed answer returned by every canAllocate overload below, whatever shard, node or allocation is passed in.
+        private final Decision decision;
+
+        public TestAllocateDecision(Decision decision) {
+            super(Settings.EMPTY);
+            this.decision = decision;
+        }
+
+        @Override
+        public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, RoutingAllocation allocation) {
+            return decision;
+        }
+
+        @Override
+        public Decision canAllocate(ShardRouting shardRouting, RoutingAllocation allocation) {
+            return decision;
+        }
+
+        @Override
+        public Decision canAllocate(RoutingNode node, RoutingAllocation allocation) {
+            return decision;
+        }
+    }
 }
--- a/core/src/test/java/org/elasticsearch/test/TestSearchContext.java
+++ b/core/src/test/java/org/elasticsearch/test/TestSearchContext.java
@ -82,6 +82,8 @@ public class TestSearchContext extends SearchContext {
    private String[] types;
    private SearchContextAggregations aggregations;

+    private final long originNanoTime = System.nanoTime();
+
    public TestSearchContext(ThreadPool threadPool,PageCacheRecycler pageCacheRecycler, BigArrays bigArrays, IndexService indexService, QueryCache filterCache, IndexFieldDataService indexFieldDataService) {
        super(ParseFieldMatcher.STRICT);
        this.pageCacheRecycler = pageCacheRecycler;
@ -170,6 +172,11 @@ public class TestSearchContext extends SearchContext {
        return null;
    }

+    @Override
+    public long getOriginNanoTime() {
+        return originNanoTime; // fixed per instance: the field is final and initialised from System.nanoTime() at construction
+    }
+
    @Override
    protected long nowInMillisImpl() {
        return 0;
--- a/dev-tools/src/main/resources/ant/fixup-failsafe-summary.xslt
+++ b/dev-tools/src/main/resources/ant/fixup-failsafe-summary.xslt
@ -1,17 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!-- remove this when junit4 summary format is fixed -->
-<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
-  <xsl:output method="xml" version="1.0" encoding="UTF-8" indent="yes"/>
-  <xsl:template match="/">
-    <failsafe-summary>
-      <xsl:attribute name="timeout">
-        <xsl:value-of select="failsafe-summary/@timeout"/>
-      </xsl:attribute>
-      <completed><xsl:value-of select="failsafe-summary/@completed"/></completed>
-      <errors><xsl:value-of select="failsafe-summary/@errors"/></errors>
-      <failures><xsl:value-of select="failsafe-summary/@failures"/></failures>
-      <skipped><xsl:value-of select="failsafe-summary/@skipped"/></skipped>
-      <failureMessage><xsl:value-of select="failsafe-summary/@failureMessage"/></failureMessage>
-    </failsafe-summary>
-  </xsl:template>
-</xsl:stylesheet>
--- a/dev-tools/src/main/resources/ant/integration-tests.xml
+++ b/dev-tools/src/main/resources/ant/integration-tests.xml
@ -24,6 +24,7 @@
      <attribute name="dir"/>
      <attribute name="args"/>
      <attribute name="spawn" default="false"/>
+      <element name="nested" optional="true"/>
    <sequential>
      <local name="failonerror"/>
      <condition property="failonerror">
@ -34,11 +35,13 @@
        <arg value="/c"/>
        <arg value="@{dir}/@{script}.bat"/>
        <arg line="@{args}"/>
+        <nested/>
      </exec>

      <exec executable="sh" osfamily="unix" dir="@{dir}" failonerror="${failonerror}" spawn="@{spawn}">
        <arg value="@{dir}/@{script}"/>
        <arg line="@{args}"/>
+        <nested/>
      </exec>
    </sequential>
  </macrodef>
@ -134,6 +137,21 @@
    </sequential>
  </target>

+  <!-- run elasticsearch in the foreground (for debugging etc): wipes ${integ.scratch}, unzips the release there and starts bin/elasticsearch (spawn=false) with a JDWP agent on port 8000 (suspend=n) -->
+  <!-- TODO: doesn't belong here, but we will figure it out -->
+  <target name="start-foreground" depends="stop-external-cluster">
+      <delete dir="${integ.scratch}"/>
+      <unzip src="${project.build.directory}/releases/${project.artifactId}-${project.version}.zip" dest="${integ.scratch}"/>
+      <local name="home"/>
+      <property name="home" location="${integ.scratch}/elasticsearch-${elasticsearch.version}"/>
+      <run-script dir="${home}" script="bin/elasticsearch" spawn="false"
+                  args="${integ.args} -Des.path.repo=${home}/repo">
+        <nested>
+          <env key="JAVA_OPTS" value="-agentlib:jdwp=transport=dt_socket,server=y,suspend=n,address=8000"/>
+        </nested>
+      </run-script>
+  </target>
+
  <!-- unzip core release artifact and start ES -->
  <target name="start-external-cluster" depends="setup-workspace" unless="${shouldskip}">
    <startup-elasticsearch/>
@ -165,11 +183,4 @@
    <delete file="${integ.pidfile}"/>
  </target>

-  <!-- TODO: remove this and the xslt when junit4 is fixed -->
-  <target name="fixup-failsafe-summary" unless="${shouldskip}">
-    <xslt in="${project.build.directory}/failsafe-reports/failsafe-summary-buggy.xml"
-          out="${project.build.directory}/failsafe-reports/failsafe-summary.xml"
-          style="${elasticsearch.tools.directory}/ant/fixup-failsafe-summary.xslt"/>
-  </target>
-
 </project>
--- a/docs/reference/aggregations/bucket/datehistogram-aggregation.asciidoc
+++ b/docs/reference/aggregations/bucket/datehistogram-aggregation.asciidoc
@ -50,9 +50,10 @@ See <<time-units>> for accepted abbreviations.
 By default, times are stored as UTC milliseconds since the epoch. Thus, all computation and "bucketing" / "rounding" is
 done on UTC. It is possible to provide a time zone value, which will cause all bucket
 computations to take place in the specified zone. The time returned for each bucket/entry is milliseconds since the
-epoch in UTC. The parameters is called `time_zone`. It accepts either a numeric value for the hours offset, for example:
-`"time_zone" : -2`. It also accepts a format of hours and minutes, like `"time_zone" : "-02:30"`.
-Another option is to provide a time zone accepted as one of the values listed here.
+epoch in UTC. The parameter is called `time_zone`. It accepts either an ISO 8601 UTC offset or a timezone id.
+A UTC offset has the form of a `+` or `-`, followed by a two-digit hour, followed by `:`, followed by two-digit minutes.
+For example, `+01:00` represents 1 hour ahead of UTC. A timezone id is an identifier from the TZ database. For example,
+Pacific time is represented as `America/Los_Angeles`.

 Lets take an example. For `2012-04-01T04:15:30Z` (UTC), with a `time_zone` of `"-08:00"`. For day interval, the actual time by
 applying the time zone and rounding falls under `2012-03-31`, so the returned value will be (in millis) of
--- a/docs/reference/cat/nodes.asciidoc
+++ b/docs/reference/cat/nodes.asciidoc
@ -182,6 +182,9 @@ phase operations |0
 phase |43ms
 |`search.query_total` |`sqto`, `searchFetchTotal` |No |Number of query
 operations |9
+|`search.scroll_current` |`scc`, `searchScrollCurrent` |No |Open scroll contexts |2
+|`search.scroll_time` |`scti`, `searchScrollTime` |No |Time scroll contexts held open |2m
+|`search.scroll_total` |`scto`, `searchScrollTotal` |No |Completed scroll contexts |1
 |`segments.count` |`sc`, `segmentsCount` |No |Number of segments |4
 |`segments.memory` |`sm`, `segmentsMemory` |No |Memory used by
 segments |1.4kb
--- a/docs/reference/index-modules/mapper.asciidoc
+++ b/docs/reference/index-modules/mapper.asciidoc
@ -52,7 +52,7 @@ creating a new index.
 [float]
 === Mapper settings

-`index.mapper.dynamic` (_static_)::
+`index.mapper.dynamic` (_dynamic_)::

    Dynamic creation of mappings for unmapped types can be completely
    disabled by setting `index.mapper.dynamic` to `false`.
--- a/docs/reference/indices.asciidoc
+++ b/docs/reference/indices.asciidoc
@ -51,6 +51,7 @@ and warmers.
 * <<indices-stats>>
 * <<indices-segments>>
 * <<indices-recovery>>
+* <<indices-shards-stores>>

 [float]
 [[status-management]]
@ -93,12 +94,16 @@ include::indices/templates.asciidoc[]

 include::indices/warmers.asciidoc[]

+include::indices/shadow-replicas.asciidoc[]
+
 include::indices/stats.asciidoc[]

 include::indices/segments.asciidoc[]

 include::indices/recovery.asciidoc[]

+include::indices/shard-stores.asciidoc[]
+
 include::indices/clearcache.asciidoc[]

 include::indices/flush.asciidoc[]
@ -107,7 +112,5 @@ include::indices/refresh.asciidoc[]

 include::indices/optimize.asciidoc[]

-include::indices/shadow-replicas.asciidoc[]
-
 include::indices/upgrade.asciidoc[]

--- a/docs/reference/mapping/fields.asciidoc
+++ b/docs/reference/mapping/fields.asciidoc
@ -1,27 +1,78 @@
 [[mapping-fields]]
-== Fields
+== Meta-Fields

-Each mapping has a number of fields associated with it
-which can be used to control how the document metadata
-(eg <<mapping-all-field>>) is indexed.
+Each document has metadata associated with it, such as the `_index`, mapping
+<<mapping-type-field,`_type`>>, and `_id` meta-fields.  The behaviour of some of these meta-fields
+can be customised when a mapping type is created.
+
+The meta-fields are:
+
+[horizontal]
+<<mapping-index-field,`_index`>>::
+
+    The index to which the document belongs.
+
+<<mapping-uid-field,`_uid`>>::
+
+    A composite field consisting of the `_type` and the `_id`.
+
+<<mapping-type-field,`_type`>>::
+
+    The document's <<all-mapping-types,mapping type>>.
+
+<<mapping-id-field,`_id`>>::
+
+    The document's ID.
+
+<<mapping-source-field,`_source`>>::
+
+    The original JSON representing the body of the document.
+
+<<mapping-all-field,`_all`>>::
+
+    A _catch-all_  field that indexes the values of all other fields.
+
+<<mapping-field-names-field,`_field_names`>>::
+
+    All fields in the document which contain non-null values.
+
+<<mapping-parent-field,`_parent`>>::
+
+    Used to create a parent-child relationship between two mapping types.
+
+<<mapping-routing-field,`_routing`>>::
+
+    A custom routing value which routes a document to a particular shard.
+
+<<mapping-size-field,`_size`>>::
+
+    The size of the `_source` field in bytes.
+
+<<mapping-timestamp-field,`_timestamp`>>::
+
+    A timestamp associated with the document, either specified manually or auto-generated.
+
+<<mapping-ttl-field,`_ttl`>>::
+
+    How long a document should live before it is automatically deleted.
+
+include::fields/index-field.asciidoc[]

 include::fields/uid-field.asciidoc[]

-include::fields/id-field.asciidoc[]
-
 include::fields/type-field.asciidoc[]

+include::fields/id-field.asciidoc[]
+
 include::fields/source-field.asciidoc[]

 include::fields/all-field.asciidoc[]

-include::fields/parent-field.asciidoc[]
-
 include::fields/field-names-field.asciidoc[]

-include::fields/routing-field.asciidoc[]
+include::fields/parent-field.asciidoc[]

-include::fields/index-field.asciidoc[]
+include::fields/routing-field.asciidoc[]

 include::fields/size-field.asciidoc[]

--- a/docs/reference/mapping/fields/all-field.asciidoc
+++ b/docs/reference/mapping/fields/all-field.asciidoc
@ -1,78 +1,416 @@
 [[mapping-all-field]]
-=== `_all`
+=== `_all` field

-The idea of the `_all` field is that it includes the text of one or more
-other fields within the document indexed. It can come very handy
-especially for search requests, where we want to execute a search query
-against the content of a document, without knowing which fields to
-search on. This comes at the expense of CPU cycles and index size.
+The `_all` field is a special _catch-all_ field which concatenates the values
+of all of the other fields into one big string, which is then
+<<analysis,analyzed>> and indexed, but not stored.  This means that it can be
+searched, but not retrieved.

-The `_all` fields can be completely disabled. Explicit field mappings and
-object mappings can be excluded / included in the `_all` field. By
-default, it is enabled and all fields are included in it for ease of
-use.
-
-When disabling the `_all` field, it is a good practice to set
-`index.query.default_field` to a different value (for example, if you
-have a main "message" field in your data, set it to `message`).
-
-One of the nice features of the `_all` field is that it takes into
-account specific fields boost levels. Meaning that if a title field is
-boosted more than content, the title (part) in the `_all` field will
-mean more than the content (part) in the `_all` field.
-
-Here is a sample mapping:
+The `_all` field allows you to search for values in documents without knowing
+which field contains the value.  This makes it a useful option when getting
+started with a new dataset. For instance:

 [source,js]
--------------------------------------------------
+--------------------------------
+PUT my_index/user/1 <1>
 {
-    "person" : {
-        "_all" : {"enabled" : true},
-        "properties" : {
-            "name" : {
-                "type" : "object",
-                "dynamic" : false,
-                "properties" : {
-                    "first" : {"type" : "string", "store" : true , "include_in_all" : false},
-                    "last" : {"type" : "string", "index" : "not_analyzed"}
-                }
-            },
-            "address" : {
-                "type" : "object",
-                "include_in_all" : false,
-                "properties" : {
-                    "first" : {
-                        "properties" : {
-                            "location" : {"type" : "string", "store" : true}
-                        }
-                    },
-                    "last" : {
-                        "properties" : {
-                            "location" : {"type" : "string"}
-                        }
-                    }
-                }
-            },
-            "simple1" : {"type" : "long", "include_in_all" : true},
-            "simple2" : {"type" : "long", "include_in_all" : false}
-        }
-    }
+  "first_name":    "John",
+  "last_name":     "Smith",
+  "date_of_birth": "1970-10-24"
 }
--------------------------------------------------

-The `_all` fields allows for `store`, `term_vector` and `analyzer` (with
-specific `analyzer` and `search_analyzer`) to be set.
+GET my_index/_search
+{
+  "query": {
+    "match": {
+      "_all": "john smith 1970"
+    }
+  }
+}
+--------------------------------
+// AUTOSENSE
+<1> The `_all` field will contain the terms: [ `"john"`, `"smith"`, `"1970"`, `"10"`, `"24"` ]

-[float]
-[[highlighting]]
-==== Highlighting
+[NOTE]
+.All values treated as strings
+=============================================================================
+
+The `date_of_birth` field in the above example is recognised as a `date` field
+and so will index a single term representing `1970-10-24 00:00:00 UTC`. The
+`_all` field, however, treats all values as strings, so the date value is
+indexed as the three string terms: `"1970"`, `"10"`, `"24"`.
+
+It is important to note that the `_all` field combines the original values
+from each field as a string. It does not combine the _terms_ from each field.
+
+=============================================================================
+
+The `_all` field is just a <<string,`string`>> field, and accepts the same
+parameters that other string fields accept, including `analyzer`,
+`term_vector`, `index_options`, and `store`.
+
+The `_all` field can be useful, especially when exploring new data using
+simple filtering.  However, by concatenating field values into one big string,
+the `_all` field loses the distinction between short fields (more relevant)
+and long fields (less relevant). For use cases where search relevance is
+important, it is better to query individual fields specifically.
+
+The `_all` field is not free: it requires extra CPU cycles and uses more disk
+space. If not needed, it can be completely <<disabling-all-field,disabled>> or
+customised on a <<include-in-all,per-field basis>>.
+
+[[querying-all-field]]
+==== Using the `_all` field in queries
+
+The <<query-dsl-query-string-query,`query_string`>> and
+<<query-dsl-simple-query-string-query,`simple_query_string`>> queries query
+the `_all` field by default, unless another field is specified:
+
+[source,js]
+--------------------------------
+GET _search
+{
+  "query": {
+    "query_string": {
+      "query": "john smith 1970"
+    }
+  }
+}
+--------------------------------
+// AUTOSENSE
+
+The same goes for the `?q=` parameter in <<search-uri-request, URI search
+requests>> (which is rewritten to a `query_string` query internally):
+
+[source,js]
+--------------------------------
+GET _search?q=john+smith+1970
+--------------------------------
+
+Other queries, such as the <<query-dsl-match-query,`match`>> and
+<<query-dsl-term-query,`term`>> queries require you to specify
+the `_all` field explicitly, as per the
+<<mapping-all-field,first example>>.
+
+[[disabling-all-field]]
+==== Disabling the `_all` field
+
+The `_all` field can be completely disabled per-type by setting `enabled` to
+`false`:
+
+[source,js]
+--------------------------------
+PUT my_index
+{
+  "mappings": {
+    "type_1": { <1>
+      "properties": {...}
+    },
+    "type_2": { <2>
+      "_all": {
+        "enabled": false
+      },
+      "properties": {...}
+    }
+  }
+}
+--------------------------------
+// AUTOSENSE
+
+<1> The `_all` field in `type_1` is enabled.
+<2> The `_all` field in `type_2` is completely disabled.
+
+If the `_all` field is disabled, then URI search requests and the
+`query_string` and `simple_query_string` queries will not be able to use it
+for queries (see <<querying-all-field>>).  You can configure them to use a
+different field with the `index.query.default_field` setting:
+
+[source,js]
+--------------------------------
+PUT my_index
+{
+  "mappings": {
+    "my_type": {
+      "_all": {
+        "enabled": false <1>
+      },
+      "properties": {
+        "content": {
+          "type": "string"
+        }
+      }
+    }
+  },
+  "settings": {
+    "index.query.default_field": "content" <2>
+  }
+}
+--------------------------------
+// AUTOSENSE
+
+<1> The `_all` field is disabled for the `my_type` type.
+<2> The `query_string` query will default to querying the `content` field in this index.
+
+[[include-in-all]]
+==== Including specific fields in `_all`
+
+Individual fields can be included or excluded from the `_all` field with the
+`include_in_all` setting, which defaults to `true`:
+
+[source,js]
+--------------------------------
+PUT my_index
+{
+  "mappings": {
+    "my_type": {
+      "properties": {
+        "title": { <1>
+          "type": "string"
+        },
+        "content": { <1>
+          "type": "string"
+        },
+        "date": { <2>
+          "type": "date",
+          "include_in_all": false
+        }
+      }
+    }
+  }
+}
+--------------------------------
+// AUTOSENSE
+
+<1> The `title` and `content` fields will be included in the `_all` field.
+<2> The `date` field will not be included in the `_all` field.
+
+The `include_in_all` parameter can also be set at the type level and on
+<<mapping-object-type,`object`>> or <<mapping-nested-type,`nested`>> fields,
+in which case all sub-fields inherit that setting.  For instance:
+
+[source,js]
+--------------------------------
+PUT my_index
+{
+  "mappings": {
+    "my_type": {
+      "include_in_all": false, <1>
+      "properties": {
+        "title":          { "type": "string" },
+        "author": {
+          "include_in_all": true, <2>
+          "properties": {
+            "first_name": { "type": "string" },
+            "last_name":  { "type": "string" }
+          }
+        },
+        "editor": {
+          "properties": {
+            "first_name": { "type": "string" }, <3>
+            "last_name":  { "type": "string", "include_in_all": true } <3>
+          }
+        }
+      }
+    }
+  }
+}
+--------------------------------
+// AUTOSENSE
+
+<1> All fields in `my_type` are excluded from `_all`.
+<2> The `author.first_name` and `author.last_name` fields are included in `_all`.
+<3> Only the `editor.last_name` field is included in `_all`.
+    The `editor.first_name` inherits the type-level setting and is excluded.
+
+[[all-field-and-boosting]]
+==== Index boosting and the `_all` field
+
+Individual fields can be _boosted_  at index time, with the `boost` parameter.
+The `_all` field takes these boosts into account:
+
+[source,js]
+--------------------------------
+PUT myindex
+{
+  "mappings": {
+    "mytype": {
+      "properties": {
+        "title": { <1>
+          "type": "string",
+          "boost": 2
+        },
+        "content": { <1>
+          "type": "string"
+        }
+      }
+    }
+  }
+}
+--------------------------------
+// AUTOSENSE
+
+<1> When querying the `_all` field, words that originated in the
+    `title` field are twice as relevant as words that originated in
+    the `content` field.
+
+WARNING: Using index-time boosting with the `_all` field has a significant
+impact on query performance. Usually the better solution is to query fields
+individually, with optional query time boosting.
+
+
+[[custom-all-fields]]
+==== Custom `_all` fields
+
+While there is only a single `_all` field per index, the <<copy-to,`copy_to`>>
+parameter allows the creation of multiple __custom `_all` fields__. For
+instance, `first_name` and `last_name` fields can be combined together into
+the `full_name` field:
+
+[source,js]
+--------------------------------
+PUT myindex
+{
+  "mappings": {
+    "mytype": {
+      "properties": {
+        "first_name": {
+          "type":    "string",
+          "copy_to": "full_name" <1>
+        },
+        "last_name": {
+          "type":    "string",
+          "copy_to": "full_name" <1>
+        },
+        "full_name": {
+          "type":    "string"
+        }
+      }
+    }
+  }
+}
+
+PUT myindex/mytype/1
+{
+  "first_name": "John",
+  "last_name": "Smith"
+}
+
+GET myindex/_search
+{
+  "query": {
+    "match": {
+      "full_name": "John Smith"
+    }
+  }
+}
+--------------------------------
+// AUTOSENSE
+
+<1> The `first_name` and `last_name` values are copied to the `full_name` field.
+
+[[highlighting-all-field]]
+==== Highlighting and the `_all` field
+
+A field can only be used for <<search-request-highlighting,highlighting>>  if
+the original string value is available, either from the
+<<mapping-source-field,`_source`>>  field or as a stored field.
+
+The `_all` field is not present in the `_source` field and it is not stored by
+default, and so cannot be highlighted. There are two options. Either
+<<all-field-store,store the `_all` field>> or highlight the
+<<all-highlight-fields,original fields>>.
+
+[[all-field-store]]
+===== Store the `_all` field
+
+If `store` is set to `true`, then the original field value is retrievable and
+can be highlighted:
+
+[source,js]
+--------------------------------
+PUT myindex
+{
+  "mappings": {
+    "mytype": {
+      "_all": {
+        "store": true
+      }
+    }
+  }
+}
+
+PUT myindex/mytype/1
+{
+  "first_name": "John",
+  "last_name": "Smith"
+}
+
+GET _search
+{
+  "query": {
+    "match": {
+      "_all": "John Smith"
+    }
+  },
+  "highlight": {
+    "fields": {
+      "_all": {}
+    }
+  }
+}
+--------------------------------
+// AUTOSENSE
+
+Of course, storing the `_all` field will use significantly more disk space
+and, because it is a combination of other fields, it may result in odd
+highlighting results.
+
+The `_all` field also accepts the `term_vector` and `index_options`
+parameters, allowing the use of the fast vector highlighter and the postings
+highlighter.
+
+[[all-highlight-fields]]
+===== Highlight original fields
+
+You can query the `_all` field, but use the original fields for highlighting as follows:
+
+[source,js]
+--------------------------------
+PUT myindex
+{
+  "mappings": {
+    "mytype": {
+      "_all": {}
+    }
+  }
+}
+
+PUT myindex/mytype/1
+{
+  "first_name": "John",
+  "last_name": "Smith"
+}
+
+GET _search
+{
+  "query": {
+    "match": {
+      "_all": "John Smith" <1>
+    }
+  },
+  "highlight": {
+    "fields": {
+      "*_name": { <2>
+        "require_field_match": "false"  <3>
+      }
+    }
+  }
+}
+--------------------------------
+// AUTOSENSE
+
+<1> The query inspects the `_all` field to find matching documents.
+<2> Highlighting is performed on the two name fields, which are available from the `_source`.
+<3> The query wasn't run against the name fields, so set `require_field_match` to `false`.

-For any field to allow
-<<search-request-highlighting,highlighting>> it has
-to be either stored or part of the `_source` field. By default the `_all`
-field does not qualify for either, so highlighting for it does not yield
-any data.

-Although it is possible to `store` the `_all` field, it is basically an
-aggregation of all fields, which means more data will be stored, and
-highlighting it might produce strange results.
--- a/docs/reference/mapping/fields/field-names-field.asciidoc
+++ b/docs/reference/mapping/fields/field-names-field.asciidoc
@ -1,6 +1,55 @@
 [[mapping-field-names-field]]
-=== `_field_names`
+=== `_field_names` field
+
+The `_field_names` field indexes the names of every field in a document that
+contains any value other than `null`.  This field is used by the
+<<query-dsl-exists-query,`exists`>> and <<query-dsl-missing-query,`missing`>>
+queries to find documents that either have or don't have any non-+null+ value
+for a particular field.
+
+The value of the `_field_names` field is accessible in queries, aggregations, and
+scripts:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/my_type/1
+{
+  "title": "This is a document"
+}
+
+PUT my_index/my_type/2
+{
+  "title": "This is another document",
+  "body": "This document has a body"
+}
+
+GET my_index/_search
+{
+  "query": {
+    "terms": {
+      "_field_names": [ "title" ] <1>
+    }
+  },
+  "aggs": {
+    "Field names": {
+      "terms": {
+        "field": "_field_names", <2>
+        "size": 10
+      }
+    }
+  },
+  "script_fields": {
+    "Field names": {
+      "script": "doc['_field_names']" <3>
+    }
+  }
+}
+
+--------------------------
+// AUTOSENSE
+
+<1> Querying on the `_field_names` field (also see the <<query-dsl-exists-query,`exists`>> and <<query-dsl-missing-query,`missing`>> queries)
+<2> Aggregating on the `_field_names` field
+<3> Accessing the `_field_names` field in scripts (inline scripts must be <<enable-dynamic-scripting,enabled>> for this example to work)

-The `_field_names` field indexes the field names of a document, which can later
-be used to search for documents based on the fields that they contain typically
-using the `exists` and `missing` filters.
--- a/docs/reference/mapping/fields/id-field.asciidoc
+++ b/docs/reference/mapping/fields/id-field.asciidoc
@ -1,11 +1,44 @@
 [[mapping-id-field]]
-=== `_id`
+=== `_id` field

-Each document indexed is associated with an id and a type. The `_id`
-field allows accessing only the id of a document.
+Each document indexed is associated with a <<mapping-type-field,`_type`>> (see
+<<all-mapping-types,Mapping Types>>) and an <<mapping-id-field,`_id`>>.  The
+`_id` field is not indexed as its value can be derived automatically from the
+<<mapping-uid-field,`_uid`>> field.

-Note, even though the `_id` is not indexed, all the APIs still work
-(since they work with the `_uid` field), as well as fetching by ids
-using `term`, `terms` or `prefix` queries/filters (including the
-specific `ids` query/filter).
+The value of the `_id` field is accessible in queries and scripts, but _not_
+in aggregations or when sorting, where the <<mapping-uid-field,`_uid`>> field
+should be used instead:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/my_type/1
+{
+  "text": "Document with ID 1"
+}
+
+PUT my_index/my_type/2
+{
+  "text": "Document with ID 2"
+}
+
+GET my_index/_search
+{
+  "query": {
+    "terms": {
+      "_id": [ "1", "2" ] <1>
+    }
+  },
+  "script_fields": {
+    "UID": {
+      "script": "doc['_id']" <2>
+    }
+  }
+}
+--------------------------
+// AUTOSENSE
+
+<1> Querying on the `_id` field (also see the <<query-dsl-ids-query,`ids` query>>)
+<2> Accessing the `_id` field in scripts (inline scripts must be <<enable-dynamic-scripting,enabled>> for this example to work)

--- a/docs/reference/mapping/fields/index-field.asciidoc
+++ b/docs/reference/mapping/fields/index-field.asciidoc
@ -1,7 +1,56 @@
 [[mapping-index-field]]
-=== `_index`
+=== `_index` field

 When performing queries across multiple indexes, it is sometimes desirable
 to add query clauses that are associated with documents of only certain
 indexes. The `_index` field allows matching on the index a document was
-indexed into.
+indexed into. Its value is accessible in queries, aggregations, scripts, and when sorting:
+
+[source,js]
+--------------------------
+# Example documents
+PUT index_1/my_type/1
+{
+  "text": "Document in index 1"
+}
+
+PUT index_2/my_type/2
+{
+  "text": "Document in index 2"
+}
+
+GET index_1,index_2/_search
+{
+  "query": {
+    "terms": {
+      "_index": ["index_1", "index_2"] <1>
+    }
+  },
+  "aggs": {
+    "indices": {
+      "terms": {
+        "field": "_index", <2>
+        "size": 10
+      }
+    }
+  },
+  "sort": [
+    {
+      "_index": { <3>
+        "order": "asc"
+      }
+    }
+  ],
+  "script_fields": {
+    "index_name": {
+      "script": "doc['_index']" <4>
+    }
+  }
+}
+--------------------------
+// AUTOSENSE
+
+<1> Querying on the `_index` field
+<2> Aggregating on the `_index` field
+<3> Sorting on the `_index` field
+<4> Accessing the `_index` field in scripts (inline scripts must be <<enable-dynamic-scripting,enabled>> for this example to work)
--- a/docs/reference/mapping/fields/parent-field.asciidoc
+++ b/docs/reference/mapping/fields/parent-field.asciidoc
@ -1,54 +1,165 @@
 [[mapping-parent-field]]
-=== `_parent`
+=== `_parent` field

-TIP: It is highly recommend to reindex all indices with `_parent` field created before version 2.x.
-     The reason for this is to gain from all the optimizations added with the 2.0 release.
+added[2.0.0,The parent-child implementation has been completely rewritten. It is advisable to reindex any 1.x indices which use parent-child to take advantage of the new optimizations]

-The parent field mapping is defined on a child mapping, and points to
-the parent type this child relates to. For example, in case of a `blog`
-type and a `blog_tag` type child document, the mapping for `blog_tag`
-should be:
+A parent-child relationship can be established between documents in the same
+index by making one mapping type the parent of another:

 [source,js]
 --------------------------------------------------
+PUT my_index
 {
-    "blog_tag" : {
-        "_parent" : {
-            "type" : "blog"
-        }
+  "mappings": {
+    "my_parent": {},
+    "my_child": {
+      "_parent": {
+        "type": "my_parent" <1>
+      }
    }
+  }
+}
+
+PUT my_index/my_parent/1 <2>
+{
+  "text": "This is a parent document"
+}
+
+PUT my_index/my_child/2?parent=1 <3>
+{
+  "text": "This is a child document"
+}
+
+PUT my_index/my_child/3?parent=1 <3>
+{
+  "text": "This is another child document"
+}
+
+GET my_index/my_parent/_search
+{
+  "query": {
+    "has_child": { <4>
+      "type": "my_child",
+      "query": {
+        "match": {
+          "text": "child document"
+        }
+      }
+    }
+  }
 }
 --------------------------------------------------
+// AUTOSENSE
+<1> The `my_parent` type is parent to the `my_child` type.
+<2> Index a parent document.
+<3> Index two child documents, specifying the parent document's ID.
+<4> Find all parent documents that have children which match the query.

-The mapping is automatically stored and indexed (meaning it can be
-searched on using the `_parent` field notation).

-==== Limitations
+See the <<query-dsl-has-child-query,`has_child`>> and
+<<query-dsl-has-parent-query,`has_parent`>> queries,
+the <<search-aggregations-bucket-children-aggregation,`children`>> aggregation,
+and <<parent-child-inner-hits,inner hits>> for more information.

-The `_parent.type` setting can only point to a type that doesn't exist yet.
-This means that a type can't become a parent type after is has been created.
+The value of the `_parent` field is accessible in queries, aggregations, scripts,
+and when sorting:

-The `parent.type` setting can't point to itself. This means self referential
-parent/child isn't supported.
+[source,js]
+--------------------------
+GET my_index/_search
+{
+  "query": {
+    "terms": {
+      "_parent": [ "1" ] <1>
+    }
+  },
+  "aggs": {
+    "parents": {
+      "terms": {
+        "field": "_parent", <2>
+        "size": 10
+      }
+    }
+  },
+  "sort": [
+    {
+      "_parent": { <3>
+        "order": "desc"
+      }
+    }
+  ],
+  "script_fields": {
+    "parent": {
+      "script": "doc['_parent']" <4>
+    }
+  }
+}
+--------------------------
+// AUTOSENSE
+
+<1> Querying on the `_parent` field (also see the <<query-dsl-has-parent-query,`has_parent` query>> and the <<query-dsl-has-child-query,`has_child` query>>)
+<2> Aggregating on the `_parent` field (also see the <<search-aggregations-bucket-children-aggregation,`children`>> aggregation)
+<3> Sorting on the `_parent` field
+<4> Accessing the `_parent` field in scripts (inline scripts must be <<enable-dynamic-scripting,enabled>> for this example to work)
+
+
+==== Parent-child restrictions
+
+* The parent and child types must be different -- parent-child relationships
+  cannot be established between documents of the same type.
+
+* The `_parent.type` setting can only point to a type that doesn't exist yet.
+  This means that a type cannot become a parent type after it has been
+  created.
+
+* Parent and child documents must be indexed on the same shard.  The `parent`
+  ID is used as the <<mapping-routing-field,routing>> value for the child,
+  to ensure that the child is indexed on the same shard as the parent.
+  This means that the same `parent` value needs to be provided when
+  <<docs-get,getting>>, <<docs-delete,deleting>>, or <<docs-update,updating>>
+  a child document.

 ==== Global ordinals

-Parent-child uses <<global-ordinals,global ordinals>> to speed up joins and global ordinals need to be rebuilt after any change to a shard.
-The more parent id values are stored in a shard, the longer it takes to rebuild global ordinals for the `_parent` field.
+Parent-child uses <<global-ordinals,global ordinals>> to speed up joins.
+Global ordinals need to be rebuilt after any change to a shard. The more
+parent id values are stored in a shard, the longer it takes to rebuild the
+global ordinals for the `_parent` field.

-Global ordinals, by default, are built lazily: the first parent-child query or aggregation after a refresh will trigger building of global ordinals.
-This can introduce a significant latency spike for your users. You can use <<fielddata-loading,eager_global_ordinals>> to shift the cost of building global ordinals
-from query time to refresh time, by mapping the _parent field as follows:
-
-==== Memory usage
-
-The only on heap memory used by parent/child is the global ordinals for the `_parent` field.
-
-How much memory is used for the global ordianls for the `_parent` field in the fielddata cache
-can be checked via the <<indices-stats,indices stats>> or <<cluster-nodes-stats,nodes stats>>
-APIS, eg:
+Global ordinals, by default, are built lazily: the first parent-child query or
+aggregation after a refresh will trigger building of global ordinals. This can
+introduce a significant latency spike for your users. You can use
+<<fielddata-loading,eager_global_ordinals>> to shift the cost of building global
+ordinals from query time to refresh time, by mapping the `_parent` field as follows:

 [source,js]
 --------------------------------------------------
-curl -XGET "http://localhost:9200/_stats/fielddata?pretty&human&fielddata_fields=_parent"
+PUT my_index
+{
+  "mappings": {
+    "my_parent": {},
+    "my_child": {
+      "_parent": {
+        "type": "my_parent",
+        "fielddata": {
+          "loading": "eager_global_ordinals"
+        }
+      }
+    }
+  }
+}
 --------------------------------------------------
+// AUTOSENSE
+
+The amount of heap used by global ordinals can be checked as follows:
+
+[source,sh]
+--------------------------------------------------
+# Per-index
+GET _stats/fielddata?human&fields=_parent
+
+# Per-node per-index
+GET _nodes/stats/indices/fielddata?human&fields=_parent
+--------------------------------------------------
+// AUTOSENSE
+
--- a/docs/reference/mapping/fields/routing-field.asciidoc
+++ b/docs/reference/mapping/fields/routing-field.asciidoc
@ -1,22 +1,134 @@
 [[mapping-routing-field]]
-=== `_routing`
+=== `_routing` field

-The routing field allows to control the `_routing` aspect when indexing
-data and explicit routing control is required. It is stored and indexed.
+A document is routed to a particular shard in an index using the following
+formula:

-[float]
-==== required
+    shard_num = hash(_routing) % num_primary_shards

-Another aspect of the `_routing` mapping is the ability to define it as
-required by setting `required` to `true`. This is very important to set
-when using routing features, as it allows different APIs to make use of
-it. For example, an index operation will be rejected if no routing value
-has been provided.
+The default value used for `_routing` is the document's <<mapping-id-field,`_id`>>
+or the document's <<mapping-parent-field,`_parent`>> ID, if present.

-[float]
-==== id uniqueness
+Custom routing patterns can be implemented by specifying a custom `routing`
+value per document.  For instance:

-When indexing documents specifying a custom `_routing`, the uniqueness
-of the `_id` is not guaranteed throughout all the shards that the index
-is composed of. In fact, documents with the same `_id` might end up in
-different shards if indexed with different `_routing` values.
+[source,js]
+------------------------------
+PUT my_index/my_type/1?routing=user1 <1>
+{
+  "title": "This is a document"
+}
+
+GET my_index/my_type/1?routing=user1 <2>
+------------------------------
+// AUTOSENSE
+
+<1> This document uses `user1` as its routing value, instead of its ID.
+<2> The same `routing` value needs to be provided when
+    <<docs-get,getting>>, <<docs-delete,deleting>>, or <<docs-update,updating>>
+    the document.
+
+The value of the `_routing` field is accessible in queries, aggregations, scripts,
+and when sorting:
+
+[source,js]
+--------------------------
+GET my_index/_search
+{
+  "query": {
+    "terms": {
+      "_routing": [ "user1" ] <1>
+    }
+  },
+  "aggs": {
+    "Routing values": {
+      "terms": {
+        "field": "_routing", <2>
+        "size": 10
+      }
+    }
+  },
+  "sort": [
+    {
+      "_routing": { <3>
+        "order": "desc"
+      }
+    }
+  ],
+  "script_fields": {
+    "Routing value": {
+      "script": "doc['_routing']" <4>
+    }
+  }
+}
+--------------------------
+// AUTOSENSE
+
+<1> Querying on the `_routing` field (also see the <<query-dsl-ids-query,`ids` query>>)
+<2> Aggregating on the `_routing` field
+<3> Sorting on the `_routing` field
+<4> Accessing the `_routing` field in scripts (inline scripts must be <<enable-dynamic-scripting,enabled>> for this example to work)
+
+
+==== Searching with custom routing
+
+Custom routing can reduce the impact of searches.  Instead of having to fan
+out a search request to all the shards in an index, the request can be sent to
+just the shard that matches the specific routing value (or values):
+
+[source,js]
+------------------------------
+GET my_index/_search?routing=user1,user2 <1>
+{
+  "query": {
+    "match": {
+      "title": "document"
+    }
+  }
+}
+------------------------------
+// AUTOSENSE
+
+<1> This search request will only be executed on the shards associated with the `user1` and `user2` routing values.
+
+
+==== Making a routing value required
+
+When using custom routing, it is important to provide the routing value
+whenever <<docs-index_,indexing>>, <<docs-get,getting>>,
+<<docs-delete,deleting>>, or <<docs-update,updating>> a document.
+
+Forgetting the routing value can lead to a document being indexed on more than
+one shard.  As a safeguard, the `_routing` field can be configured to make a
+custom `routing` value required for all CRUD operations:
+
+[source,js]
+------------------------------
+PUT my_index
+{
+  "mappings": {
+    "my_type": {
+      "_routing": {
+        "required": true <1>
+      }
+    }
+  }
+}
+
+PUT my_index/my_type/1 <2>
+{
+  "text": "No routing value provided"
+}
+------------------------------
+// AUTOSENSE
+<1> Routing is required for `my_type` documents.
+<2> This index request throws a `routing_missing_exception`.
+
+==== Unique IDs with custom routing
+
+When indexing documents specifying a custom `_routing`, the uniqueness of the
+`_id` is not guaranteed across all of the shards in the index. In fact,
+documents with the same `_id` might end up on different shards if indexed with
+different `_routing` values.
+
+It is up to the user to ensure that IDs are unique across the index.
--- a/docs/reference/mapping/fields/size-field.asciidoc
+++ b/docs/reference/mapping/fields/size-field.asciidoc
@ -1,15 +1,76 @@
 [[mapping-size-field]]
-=== `_size`
+=== `_size` field

-The `_size` field allows to automatically index the size of the original
-`_source` indexed. By default, it's disabled. In order to enable it, set
-the mapping to:
+The `_size` field, when enabled, indexes the size in bytes of the original
+<<mapping-source-field,`_source`>>. In order to enable it, set
+the mapping as follows:

 [source,js]
--------------------------------------------------
+--------------------------
+PUT my_index
 {
-    "tweet" : {
-        "_size" : {"enabled" : true}
+  "mappings": {
+    "my_type": {
+      "_size": {
+        "enabled": true
+      }
    }
+  }
 }
--------------------------------------------------
+--------------------------
+// AUTOSENSE
+
+The value of the `_size` field is accessible in queries, aggregations, scripts,
+and when sorting:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/my_type/1
+{
+  "text": "This is a document"
+}
+
+PUT my_index/my_type/2
+{
+  "text": "This is another document"
+}
+
+GET my_index/_search
+{
+  "query": {
+    "range": {
+      "_size": { <1>
+        "gt": 10
+      }
+    }
+  },
+  "aggs": {
+    "Sizes": {
+      "terms": {
+        "field": "_size", <2>
+        "size": 10
+      }
+    }
+  },
+  "sort": [
+    {
+      "_size": { <3>
+        "order": "desc"
+      }
+    }
+  ],
+  "script_fields": {
+    "Size": {
+      "script": "doc['_size']"  <4>
+    }
+  }
+}
+--------------------------
+// AUTOSENSE
+
+<1> Querying on the `_size` field
+<2> Aggregating on the `_size` field
+<3> Sorting on the `_size` field
+<4> Accessing the `_size` field in scripts (inline scripts must be <<enable-dynamic-scripting,enabled>> for this example to work)
+
--- a/docs/reference/mapping/fields/source-field.asciidoc
+++ b/docs/reference/mapping/fields/source-field.asciidoc
@ -1,12 +1,12 @@
 [[mapping-source-field]]
-=== `_source`
+=== `_source` field

-The `_source` field is an automatically generated field that stores the actual
-JSON that was used as the indexed document. It is not indexed (searchable),
-just stored. When executing "fetch" requests, like <<docs-get,get>> or
-<<search-search,search>>, the `_source` field is returned by default.
+The `_source` field contains the original JSON document body that was passed
+at index time.  The `_source` field itself is not indexed (and thus is not
+searchable), but it is stored so that it can be returned when executing
+_fetch_ requests, like <<docs-get,get>> or <<search-search,search>>.

-==== Disabling source
+==== Disabling the `_source` field

 Though very handy to have around, the source field does incur storage overhead
 within the index. For this reason, it can be disabled as follows:
@ -26,7 +26,7 @@ PUT tweets
 // AUTOSENSE

 [WARNING]
-.Think before disabling the source field
+.Think before disabling the `_source` field
 ==================================================

 Users often disable the `_source` field without thinking about the
@ -46,11 +46,11 @@ available then a number of features are not supported:

 * Potentially in the future, the ability to repair index corruption
  automatically.
-
-If disk space is a concern, rather increase the
-<<index-codec,compression level>> instead of disabling the `_source`.
 ==================================================

+TIP: If disk space is a concern, increase the
+<<index-codec,compression level>> rather than disabling the `_source`.
+
 .The metrics use case
 **************************************************

@ -69,12 +69,20 @@ metrics case.


 [[include-exclude]]
-==== Including / Excluding fields from source
+==== Including / Excluding fields from `_source`

 An expert-only feature is the ability to prune the contents of the `_source`
 field after the document has been indexed, but before the `_source` field is
-stored.  The `includes`/`excludes` parameters (which also accept wildcards)
-can be used as follows:
+stored.
+
+WARNING: Removing fields from the `_source` has similar downsides to disabling
+`_source`, especially the fact that you cannot reindex documents from one
+Elasticsearch index to another. Consider using
+<<search-request-source-filtering,source filtering>> or a
+<<mapping-transform,transform script>> instead.
+
+The `includes`/`excludes` parameters (which also accept wildcards) can be used
+as follows:

 [source,js]
 --------------------------------------------------
@ -126,8 +134,3 @@ GET logs/event/_search
 <1> These fields will be removed from the stored `_source` field.
 <2> We can still search on this field, even though it is not in the stored `_source`.

-WARNING: Removing fields from the `_source` has similar downsides to disabling
-`_source`, especially the fact that you cannot reindex documents from one
-Elasticsearch index to another. Consider using
-<<search-request-source-filtering,source filtering>> or a
-<<mapping-transform,transform script>> instead.
--- a/docs/reference/mapping/fields/timestamp-field.asciidoc
+++ b/docs/reference/mapping/fields/timestamp-field.asciidoc
@ -1,90 +1,94 @@
 [[mapping-timestamp-field]]
-=== `_timestamp`
+=== `_timestamp` field

-The `_timestamp` field allows to automatically index the timestamp of a
-document. If it is not provided it will be automatically set
-to a <<mapping-timestamp-field-default,default date>>.
-
-[float]
-==== enabled
-
-By default it is disabled. In order to enable it, the following mapping
-should be defined:
+The `_timestamp` field, when enabled, allows a timestamp to be indexed and
+stored with a document. The timestamp may be specified manually, generated
+automatically, or set to a default value:

 [source,js]
--------------------------------------------------
+------------------------------------
+PUT my_index
 {
-    "tweet" : {
-        "_timestamp" : { "enabled" : true }
+  "mappings": {
+    "my_type": {
+      "_timestamp": { <1>
+        "enabled": true
+      }
    }
+  }
 }
--------------------------------------------------

-[float]
-[[mapping-timestamp-field-format]]
-==== format
+PUT my_index/my_type/1?timestamp=2015-01-01 <2>
+{ "text": "Timestamp as a formatted date" }

-You can define the <<mapping-date-format,date
-format>> used to parse the provided timestamp value. For example:
+PUT my_index/my_type/2?timestamp=1420070400000 <3>
+{ "text": "Timestamp as milliseconds since the epoch" }
+
+PUT my_index/my_type/3 <4>
+{ "text": "Autogenerated timestamp set to now()" }
+
+------------------------------------
+// AUTOSENSE
+
+<1> Enable the `_timestamp` field with default settings.
+<2> Set the timestamp manually with a formatted date.
+<3> Set the timestamp with milliseconds since the epoch.
+<4> Auto-generates a timestamp with <<date-math,now()>>.
+
+The behaviour of the `_timestamp` field can be configured with the following parameters:
+
+`default`::
+
+    A default value to be used if none is provided.  Defaults to <<date-math,now()>>.
+
+`format`::
+
+    The <<mapping-date-format,date format>> (or formats) to use when parsing timestamps.  Defaults to `epoch_millis||strictDateOptionalTime`.
+
+`ignore_missing`::
+
+    If `true` (default), replace missing timestamps with the `default` value.  If `false`, throw an exception.
+
+
+The value of the `_timestamp` field is accessible in queries, aggregations, scripts,
+and when sorting:

 [source,js]
--------------------------------------------------
+--------------------------
+GET my_index/_search
 {
-    "tweet" : {
-        "_timestamp" : {
-            "enabled" : true,
-            "path" : "post_date",
-            "format" : "YYYY-MM-dd"
-        }
+  "query": {
+    "range": {
+      "_timestamp": { <1>
+        "gte": "2015-01-01"
+      }
    }
-}
--------------------------------------------------
-
-Note, the default format is `epoch_millis||strictDateOptionalTime`. The timestamp value will
-first be parsed as a number and if it fails the format will be tried.
-
-[float]
-[[mapping-timestamp-field-default]]
-==== default
-
-You can define a default value for when timestamp is not provided
-within the index request or in the `_source` document.
-
-By default, the default value is `now` which means the date the document was processed by the indexing chain.
-
-You can reject documents which do not provide a `timestamp` value by setting `ignore_missing` to false (default to `true`):
-
-[source,js]
--------------------------------------------------
-{
-    "tweet" : {
-        "_timestamp" : {
-            "enabled" : true,
-            "ignore_missing" : false
-        }
+  },
+  "aggs": {
+    "Timestamps": {
+      "terms": {
+        "field": "_timestamp", <2>
+        "size": 10
+      }
    }
-}
--------------------------------------------------
-
-You can also set the default value to any date respecting <<mapping-timestamp-field-format,timestamp format>>:
-
-[source,js]
--------------------------------------------------
-{
-    "tweet" : {
-        "_timestamp" : {
-            "enabled" : true,
-            "format" : "YYYY-MM-dd",
-            "default" : "1970-01-01"
-        }
+  },
+  "sort": [
+    {
+      "_timestamp": { <3>
+        "order": "desc"
+      }
    }
+  ],
+  "script_fields": {
+    "Timestamp": {
+      "script": "doc['_timestamp']" <4>
+    }
+  }
 }
--------------------------------------------------
-
-If you don't provide any timestamp value, _timestamp will be set to this default value.
-
-In elasticsearch 1.4, we allowed setting explicitly `"default":null` which is not possible anymore
-as we added a new `ignore_missing` setting.
-When reading an index created with elasticsearch 1.4 and using this, we automatically update it by
-removing `"default": null` and setting `"ignore_missing": false`
+--------------------------
+// AUTOSENSE

+<1> Querying on the `_timestamp` field
+<2> Aggregating on the `_timestamp` field
+<3> Sorting on the `_timestamp` field
+<4> Accessing the `_timestamp` field in scripts (inline scripts must be <<enable-dynamic-scripting,enabled>> for this example to work)
--- a/docs/reference/mapping/fields/ttl-field.asciidoc
+++ b/docs/reference/mapping/fields/ttl-field.asciidoc
@ -1,67 +1,106 @@
 [[mapping-ttl-field]]
 === `_ttl`

-A lot of documents naturally come with an expiration date. Documents can
-therefore have a `_ttl` (time to live), which will cause the expired
-documents to be deleted automatically.
+Some types of documents, such as session data or special offers, come with an
+expiration date. The `_ttl` field allows you to specify the minimum time a
+document should live, after which time the document is deleted automatically.

-`_ttl` accepts two parameters which are described below, every other setting will be silently ignored.
+[TIP]
+.Prefer index-per-timeframe to TTL
+======================================================

-[float]
-==== enabled
+With TTL, expired documents first have to be marked as deleted then later
+purged from the index when segments are merged.  For append-only time-based
+data such as log events, it is much more efficient to use an index-per-day /
+week / month instead of TTLs.  Old log data can be removed by simply deleting
+old indices.

-By default it is disabled, in order to enable it, the following mapping
-should be defined:
+======================================================
+
+The `_ttl` field may be enabled as follows:

 [source,js]
--------------------------------------------------
+-------------------------------
+PUT my_index
 {
-    "tweet" : {
-        "_ttl" : { "enabled" : true }
+  "mappings": {
+    "my_type": {
+      "_ttl": {
+        "enabled": true
+      }
    }
+  }
 }
--------------------------------------------------

-`_ttl` can only be enabled once and never be disabled again.
+PUT my_index/my_type/1?ttl=10m <1>
+{
+  "text": "Will expire in 10 minutes"
+}

-[float]
-==== default
+PUT my_index/my_type/2 <2>
+{
+  "text": "Will not expire"
+}
+-------------------------------
+// AUTOSENSE
+<1> This document will expire 10 minutes after being indexed.
+<2> This document has no TTL set and will not expire.

-You can provide a per index/type default `_ttl` value as follows:
+The expiry time is calculated as the value of the
+<<mapping-timestamp-field,`_timestamp`>> field (or `now()` if the `_timestamp`
+is not enabled) plus the `ttl` specified in the indexing request.
+
+==== Default TTL
+
+You can provide a default `_ttl`, which will be applied to indexing requests where the `ttl` is not specified:

 [source,js]
--------------------------------------------------
+-------------------------------
+PUT my_index
 {
-    "tweet" : {
-        "_ttl" : { "enabled" : true, "default" : "1d" }
+  "mappings": {
+    "my_type": {
+      "_ttl": {
+        "enabled": true,
+        "default": "5m"
+      }
    }
+  }
 }
--------------------------------------------------

-In this case, if you don't provide a `_ttl` value in your query or in
-the `_source` all tweets will have a `_ttl` of one day.
+PUT my_index/my_type/1?ttl=10m <1>
+{
+  "text": "Will expire in 10 minutes"
+}

-In case you do not specify a time unit like `d` (days), `m` (minutes),
-`h` (hours), `ms` (milliseconds) or `w` (weeks), milliseconds is used as
-default unit.
+PUT my_index/my_type/2 <2>
+{
+  "text": "Will expire in 5 minutes"
+}
+-------------------------------
+// AUTOSENSE
+<1> This document will expire 10 minutes after being indexed.
+<2> This document has no TTL set and so will expire after the default 5 minutes.

-If no `default` is set and no `_ttl` value is given then the document
-has an infinite `_ttl` and will not expire.
+The `default` value can use <<time-units,time units>> like `d` for days, and
+will use `ms` as the default unit if no time unit is provided.

 You can dynamically update the `default` value using the put mapping
 API. It won't change the `_ttl` of already indexed documents but will be
 used for future documents.

-[float]
 ==== Note on documents expiration

-Expired documents will be automatically deleted regularly. You can
-dynamically set the `indices.ttl.interval` to fit your needs. The
-default value is `60s`.
+Expired documents will be automatically deleted periodically. The following
+settings control the expiry process:

-The deletion orders are processed by bulk. You can set
-`indices.ttl.bulk_size` to fit your needs. The default value is `10000`.
+`indices.ttl.interval`::
+
+How often the purge process should run. Defaults to `60s`. Expired documents
+may still be retrieved before they are purged.
+
+`indices.ttl.bulk_size`::
+
+How many deletions are handled by a single <<docs-bulk,`bulk`>> request. The
+default value is `10000`.

-Note that the expiration procedure handle versioning properly so if a
-document is updated between the collection of documents to expire and
-the delete order, the document won't be deleted.
--- a/docs/reference/mapping/fields/type-field.asciidoc
+++ b/docs/reference/mapping/fields/type-field.asciidoc
@ -1,7 +1,60 @@
 [[mapping-type-field]]
-=== `_type`
+=== `_type` field
+
+Each document indexed is associated with a <<mapping-type-field,`_type`>> (see
+<<all-mapping-types,Mapping Types>>) and an <<mapping-id-field,`_id`>>.  The
+`_type` field is indexed in order to make searching by type name fast.
+
+The value of the `_type` field is accessible in queries, aggregations,
+scripts, and when sorting:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/type_1/1
+{
+  "text": "Document with type 1"
+}
+
+PUT my_index/type_2/2
+{
+  "text": "Document with type 2"
+}
+
+GET my_index/type_*/_search
+{
+  "query": {
+    "terms": {
+      "_type": [ "type_1", "type_2" ] <1>
+    }
+  },
+  "aggs": {
+    "types": {
+      "terms": {
+        "field": "_type", <2>
+        "size": 10
+      }
+    }
+  },
+  "sort": [
+    {
+      "_type": { <3>
+        "order": "desc"
+      }
+    }
+  ],
+  "script_fields": {
+    "type": {
+      "script": "doc['_type']" <4>
+    }
+  }
+}
+
+--------------------------
+// AUTOSENSE
+
+<1> Querying on the `_type` field
+<2> Aggregating on the `_type` field
+<3> Sorting on the `_type` field
+<4> Accessing the `_type` field in scripts (inline scripts must be <<enable-dynamic-scripting,enabled>> for this example to work)

-Each document indexed is associated with an id and a type. The `_type`
-field allows accessing only the type of a document. It is indexed
-to allow quickly filtering on type, for example, when performing
-a search request on a single or multiple types.
--- a/docs/reference/mapping/fields/uid-field.asciidoc
+++ b/docs/reference/mapping/fields/uid-field.asciidoc
@ -1,10 +1,59 @@
 [[mapping-uid-field]]
-=== `_uid`
+=== `_uid` field

-Each document indexed is associated with an id and a type, the internal
-`_uid` field is the unique identifier of a document within an index and
-is composed of the type and the id (meaning that different types can
-have the same id and still maintain uniqueness).
+Each document indexed is associated with a <<mapping-type-field,`_type`>> (see
+<<all-mapping-types,Mapping Types>>) and an <<mapping-id-field,`_id`>>.  These
+values are combined as `{type}#{id}` and indexed as the `_uid` field.
+
+The value of the `_uid` field is accessible in queries, aggregations, scripts,
+and when sorting:
+
+[source,js]
+--------------------------
+# Example documents
+PUT my_index/my_type/1
+{
+  "text": "Document with ID 1"
+}
+
+PUT my_index/my_type/2
+{
+  "text": "Document with ID 2"
+}
+
+GET my_index/_search
+{
+  "query": {
+    "terms": {
+      "_uid": [ "my_type#1", "my_type#2" ] <1>
+    }
+  },
+  "aggs": {
+    "UIDs": {
+      "terms": {
+        "field": "_uid", <2>
+        "size": 10
+      }
+    }
+  },
+  "sort": [
+    {
+      "_uid": { <3>
+        "order": "desc"
+      }
+    }
+  ],
+  "script_fields": {
+    "UID": {
+      "script": "doc['_uid']" <4>
+    }
+  }
+}
+--------------------------
+// AUTOSENSE
+
+<1> Querying on the `_uid` field (also see the <<query-dsl-ids-query,`ids` query>>)
+<2> Aggregating on the `_uid` field
+<3> Sorting on the `_uid` field
+<4> Accessing the `_uid` field in scripts (inline scripts must be <<enable-dynamic-scripting,enabled>> for this example to work)

-The `_uid` field is for type based filtering, as well as for
-lookups of `_id` and `_type`.
--- a/docs/reference/migration/migrate_2_0.asciidoc
+++ b/docs/reference/migration/migrate_2_0.asciidoc
@ -546,6 +546,12 @@ query to not compute scores and optionally caches the result.

 As a consequence the `query` filter serves no purpose anymore and is deprecated.

+=== Timezone for date field
+
+The `time_zone` parameter on queries or aggregations of `date` type fields
+must now be specified as either an ISO 8601 UTC offset, or a timezone id. For
+example, the value `+1:00` must now be `+01:00`.
+
 === Snapshot and Restore

 Locations of the shared file system repositories and the URL repositories with `file:` URLs has to be now registered
--- a/docs/reference/query-dsl/filtered-query.asciidoc
+++ b/docs/reference/query-dsl/filtered-query.asciidoc
@ -18,7 +18,7 @@ documents that remain.
      "match": { "tweet": "full text search" }
    },
    "filter": {
-      "range": { "created": { "gte": "now - 1d / d" }}
+      "range": { "created": { "gte": "now-1d/d" }}
    }
  }
 }
@ -37,7 +37,7 @@ curl -XGET localhost:9200/_search -d '
        "match": { "tweet": "full text search" }
      },
      "filter": {
-        "range": { "created": { "gte": "now - 1d / d" }}
+        "range": { "created": { "gte": "now-1d/d" }}
      }
    }
  }
@ -61,7 +61,7 @@ curl -XGET localhost:9200/_search -d '
  "query": {
    "filtered": { <1>
      "filter": {
-        "range": { "created": { "gte": "now - 1d / d" }}
+        "range": { "created": { "gte": "now-1d/d" }}
      }
    }
  }
@ -83,7 +83,7 @@ Multiple filters can be applied by wrapping them in a
    "query": { "match": { "tweet": "full text search" }},
    "filter": {
      "bool": {
-        "must": { "range": { "created": { "gte": "now - 1d / d" }}},
+        "must": { "range": { "created": { "gte": "now-1d/d" }}},
        "should": [
          { "term": { "featured": true }},
          { "term": { "starred":  true }}
--- a/docs/reference/query-dsl/range-query.asciidoc
+++ b/docs/reference/query-dsl/range-query.asciidoc
@ -43,7 +43,7 @@ move them to UTC time based date:
        "born" : {
            "gte": "2012-01-01",
            "lte": "now",
-            "time_zone": "+1:00"
+            "time_zone": "+01:00"
        }
    }
 }
--- a/plugins/pom.xml
+++ b/plugins/pom.xml
@ -321,8 +321,6 @@
                            <configuration>
                                <target>
                                    <ant antfile="${elasticsearch.integ.antfile}" target="stop-external-cluster"/>
-                                    <!-- TODO: remove this and the xslt when junit4 is fixed -->
-                                    <ant antfile="${elasticsearch.integ.antfile}" target="fixup-failsafe-summary"/>
                                </target>
                            </configuration>
                        </execution>
--- a/pom.xml
+++ b/pom.xml
@ -43,7 +43,7 @@
        <!-- libraries -->
        <lucene.version>5.2.1</lucene.version>
        <lucene.maven.version>5.2.1</lucene.maven.version>
-        <testframework.version>2.1.15</testframework.version>
+        <testframework.version>2.1.16</testframework.version>
        <jackson.version>2.5.3</jackson.version>
        <slf4j.version>1.6.2</slf4j.version>
        <log4j.version>1.2.17</log4j.version>
@ -718,7 +718,7 @@
                                <listeners>
                                    <report-ant-xml 
                                        mavenExtensions="true" 
-                                        summaryFile="${project.build.directory}/failsafe-reports/failsafe-summary-buggy.xml"
+                                        summaryFile="${project.build.directory}/failsafe-reports/failsafe-summary.xml"
                                        dir="${project.build.directory}/failsafe-reports"/>
                                </listeners>
                                <!-- currently only 1 cpu works, because integ tests don't make "unique" test directories? -->
--- a/rest-api-spec/src/main/resources/rest-api-spec/test/cat.shards/10_basic.yaml
+++ b/rest-api-spec/src/main/resources/rest-api-spec/test/cat.shards/10_basic.yaml
@ -60,6 +60,9 @@
                    search.query_current             .+   \n
                    search.query_time                .+   \n
                    search.query_total               .+   \n
+                    search.scroll_current            .+   \n
+                    search.scroll_time               .+   \n
+                    search.scroll_total              .+   \n
                    segments.count                   .+   \n
                    segments.memory                  .+   \n
                    segments.index_writer_memory     .+   \n