Merge pull request #12994 from jpountz/deprecate/scan

Deprecate the `scan` search type.
2015-08-20 12:47:45 +02:00 · 2015-08-20 12:47:45 +02:00 · 41d8fbe8f5
parent 833f821171 6fa258b8fa
commit 41d8fbe8f5
13 changed files with 121 additions and 121 deletions
--- a/core/src/main/java/org/elasticsearch/action/search/SearchType.java
+++ b/core/src/main/java/org/elasticsearch/action/search/SearchType.java
@ -54,7 +54,9 @@ public enum SearchType {
    /**
     * Performs scanning of the results which executes the search without any sorting.
     * It will automatically start scrolling the result set.
+     * @deprecated will be removed in 3.0; use a regular scroll request sorted by {@code _doc} instead
     */
+    @Deprecated
    SCAN((byte) 4),
    /**
     * Only counts the results, will still execute aggregations and the like.
@ -69,6 +71,7 @@ public enum SearchType {
    public static final SearchType DEFAULT = QUERY_THEN_FETCH;

    private static final ParseField COUNT_VALUE = new ParseField("count").withAllDeprecated("query_then_fetch");
+    private static final ParseField SCAN_VALUE = new ParseField("scan").withAllDeprecated("query_then_fetch sorting on `_doc`");

    private byte id;

@ -121,7 +124,7 @@ public enum SearchType {
            return SearchType.QUERY_THEN_FETCH;
        } else if ("query_and_fetch".equals(searchType)) {
            return SearchType.QUERY_AND_FETCH;
-        } else if ("scan".equals(searchType)) {
+        } else if (parseFieldMatcher.match(searchType, SCAN_VALUE)) {
            return SearchType.SCAN;
        } else if (parseFieldMatcher.match(searchType, COUNT_VALUE)) {
            return SearchType.COUNT;
--- a/core/src/main/java/org/elasticsearch/action/search/type/TransportSearchScanAction.java
+++ b/core/src/main/java/org/elasticsearch/action/search/type/TransportSearchScanAction.java
@ -40,6 +40,7 @@ import org.elasticsearch.threadpool.ThreadPool;

 import static org.elasticsearch.action.search.type.TransportSearchHelper.buildScrollId;

+@Deprecated // remove in 3.0
 public class TransportSearchScanAction extends TransportSearchTypeAction {

    @Inject
--- a/core/src/main/java/org/elasticsearch/search/SearchService.java
+++ b/core/src/main/java/org/elasticsearch/search/SearchService.java
@ -263,6 +263,7 @@ public class SearchService extends AbstractLifecycleComponent<SearchService> {
        }
    }

+    @Deprecated // remove in 3.0
    public QuerySearchResult executeScan(ShardSearchRequest request) {
        final SearchContext context = createAndPutContext(request);
        final int originalSize = context.size();
--- a/core/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java
+++ b/core/src/main/java/org/elasticsearch/search/action/SearchServiceTransportAction.java
@ -418,6 +418,7 @@ public class SearchServiceTransportAction extends AbstractComponent {
        }
    }

+    @Deprecated // remove in 3.0
    class SearchScanTransportHandler implements TransportRequestHandler<ShardSearchTransportRequest> {
        @Override
        public void messageReceived(ShardSearchTransportRequest request, TransportChannel channel) throws Exception {
--- a/docs/plugins/delete-by-query.asciidoc
+++ b/docs/plugins/delete-by-query.asciidoc
@ -6,7 +6,7 @@ The delete-by-query plugin adds support for deleting all of the documents
 replacement for the problematic _delete-by-query_ functionality which has been
 removed from Elasticsearch core.

-Internally, it uses the {ref}/search-request-scroll.html#scroll-scan[Scan/Scroll]
+Internally, it uses {ref}/search-request-scroll.html[Scroll]
 and {ref}/docs-bulk.html[Bulk] APIs to delete documents in an efficient and
 safe manner. It is slower than the old _delete-by-query_ functionality, but
 fixes the problems with the previous implementation.
@ -101,7 +101,7 @@ See {ref}/search-uri-request.html[URI search request] for details.

 `size`::

-The number of hits returned *per shard* by the {ref}/search-request-scroll.html#scroll-scan[scan]
+The number of hits returned by the {ref}/search-request-scroll.html[scroll]
 request.  Defaults to 10.  May also be specified in the request body.

 `timeout`::
@ -148,7 +148,7 @@ The JSON response looks like this:
 --------------------------------------------------

 Internally, the query is used to execute an initial
-{ref}/search-request-scroll.html#scroll-scan[scroll/scan] request. As hits are
+{ref}/search-request-scroll.html[scroll] request. As hits are
 pulled from the scroll API, they are passed to the {ref}/docs-bulk.html[Bulk
 API] for deletion.

@ -157,7 +157,7 @@ was visible to search at the time the request was executed.  Any documents
 that have been reindexed or updated during execution will not be deleted.

 Since documents can be updated or deleted by external operations during the
-_scan-scroll-bulk_ process, the plugin keeps track of different counters for
+_scroll-bulk_ process, the plugin keeps track of different counters for
 each index, with the totals displayed under the `_all` index.  The counters
 are as follows:

@ -212,7 +212,7 @@ Resiliency::
 === New delete-by-query implementation

 The new implementation, provided by this plugin, is built internally
-using  {ref}/search-request-scroll.html#scroll-scan[scan and scroll] to return
+using  {ref}/search-request-scroll.html[scroll] to return
 the document IDs and versions of all the documents that need to be deleted.
 It then uses  the {ref}/docs-bulk.html[`bulk` API] to do the actual deletion.

@ -231,8 +231,8 @@ try-once::

 syntactic sugar::

-    A delete-by-query is equivalent to a scan/scroll search and corresponding
-    bulk-deletes by ID.
+    A delete-by-query is equivalent to a scroll search ordered by `_doc` and
+    corresponding bulk-deletes by ID.

 point-in-time::

@ -267,4 +267,4 @@ move the functionality to a plugin instead of replacing the feautre in core:
 * There is currently no way to monitor or cancel a running delete-by-query
  request, except for the `timeout` parameter.

-We have plans to solve both of these issues in a later version of Elasticsearch.
+We have plans to solve both of these issues in a later version of Elasticsearch.
--- a/docs/reference/migration/index.asciidoc
+++ b/docs/reference/migration/index.asciidoc
@ -16,6 +16,7 @@ As a general rule:

 See <<setup-upgrade>> for more info.
 --
+include::migrate_2_1.asciidoc[]

 include::migrate_2_0.asciidoc[]

--- a/docs/reference/migration/migrate_2_0/removals.asciidoc
+++ b/docs/reference/migration/migrate_2_0/removals.asciidoc
@ -27,8 +27,8 @@ The old delete-by-query functionality was fast but unsafe.  It could lead to
 document differences between the primary and replica shards, and could even
 produce out of memory exceptions and cause the cluster to crash.

-This feature has been reimplemented using the <<scroll-scan,scroll/scan>> and
-the <<docs-bulk,`bulk`>> API, which may be slower for queries which match
+This feature has been reimplemented using the <<search-request-scroll,scroll>> and
+<<docs-bulk,`bulk`>> APIs, which may be slower for queries which match
 large numbers of documents, but is safe.

 Currently, a long running delete-by-query job cannot be cancelled, which is
--- a/docs/reference/migration/migrate_2_1.asciidoc
+++ b/docs/reference/migration/migrate_2_1.asciidoc
@ -0,0 +1,27 @@
+[[breaking-changes-2.1]]
+== Breaking changes in 2.1
+
+This section discusses the changes that you need to be aware of when migrating
+your application to Elasticsearch 2.1.
+
+=== Search changes
+
+==== `search_type=scan` deprecated
+
+The `scan` search type has been deprecated. All benefits from this search
+type can now be achieved by doing a scroll request that sorts documents in
+`_doc` order, for instance:
+
+[source,sh]
+---------------
+GET /my_index/_search?scroll=2m
+{
+  "sort": [
+    "_doc"
+  ]
+}
+---------------
+
+Scroll requests sorted by `_doc` have been optimized to more efficiently resume
+from where the previous request stopped, so this will have the same performance
+characteristics as the former `scan` search type.
--- a/docs/reference/search/request-body.asciidoc
+++ b/docs/reference/search/request-body.asciidoc
@ -64,7 +64,7 @@ And here is a sample response:
 `search_type`::

    The type of the search operation to perform. Can be
-    `dfs_query_then_fetch`, `query_then_fetch`, or 'scan'.
+    `dfs_query_then_fetch` or `query_then_fetch`.
    Defaults to `query_then_fetch`.
    See <<search-request-search-type,_Search Type_>> for more.

--- a/docs/reference/search/request/scroll.asciidoc
+++ b/docs/reference/search/request/scroll.asciidoc
@ -90,59 +90,20 @@ used.
 NOTE: If the request specifies aggregations, only the initial search response
 will contain the aggregations results.

-[[scroll-scan]]
-==== Efficient scrolling with Scroll-Scan
-
-Deep pagination with <<search-request-from-size,`from` and `size`>> -- e.g.
-`?size=10&from=10000` -- is very inefficient as (in this example) 100,000
-sorted results have to be retrieved from each shard and resorted in order to
-return just 10 results.  This process has to be repeated for every page
-requested.
-
-The `scroll` API keeps track of which results have already been returned and
-so is able to return sorted results more efficiently than with deep
-pagination.  However, sorting results (which happens by default) still has a
-cost.
-
-Normally, you just want to retrieve all results and the order doesn't matter.
-Scrolling can be combined with the <<scan,`scan`>> search type to disable
-any scoring or sorting and to return results in the most efficient way
-possible.  All that is needed is to add `search_type=scan` to the query string
-of the initial search request:
+NOTE: Scroll requests have optimizations that make them faster when the sort
+order is `_doc`. If you want to iterate over all documents regardless of the
+order, this is the most efficient option:

 [source,js]
 --------------------------------------------------
-curl 'localhost:9200/twitter/tweet/_search?scroll=1m&search_type=scan' <1> -d '
+curl -XGET 'localhost:9200/_search?scroll=1m' -d '
 {
-    "query": {
-        "match" : {
-            "title" : "elasticsearch"
-        }
-    }
+  "sort": [
+    "_doc"
+  ]
 }
 '
 --------------------------------------------------
-<1> Setting `search_type` to `scan` disables sorting and makes scrolling
-    very efficient.
-
-A scanning scroll request differs from a standard scroll request in four
-ways:
-
-* No score is calculated and sorting is disabled. Results are returned in
-  the order they appear in the index.
-
-* Aggregations are not supported.
-
-* The response of the initial `search` request will not contain any results in
-  the `hits` array. The first results will be returned by the first `scroll`
-  request.
-
-* The <<search-request-from-size,`size` parameter>> controls the number of
-  results *per shard*, not per request, so a `size` of `10` which hits 5
-  shards will return a maximum of 50 results per `scroll` request.
-
-If you want the scoring to happen, even without sorting on it, set the
-`track_scores` parameter to `true`.

 [[scroll-search-context]]
 ==== Keeping the search context alive
--- a/docs/reference/search/request/search-type.asciidoc
+++ b/docs/reference/search/request/search-type.asciidoc
@ -26,8 +26,8 @@ each shard using these global frequencies.
 Also, because of the need to sort the results, getting back a large
 document set, or even scrolling it, while maintaining the correct sorting
 behavior can be a very expensive operation. For large result set
-scrolling without sorting, the `scan` search type (explained below) is
-also available.
+scrolling, it is best to sort by `_doc` if the order in which documents
+are returned is not important.

 Elasticsearch is very flexible and allows to control the type of search
 to execute on a *per search request* basis. The type can be configured
@ -77,9 +77,11 @@ API as it provides more options.
 [[scan]]
 ==== Scan

+deprecated[2.1.0, `scan` does not provide any benefits over a regular `scroll` request sorted by `_doc`]
+
 Parameter value: *scan*.

 The `scan` search type disables sorting in order to allow very efficient
-scrolling through large result sets.  See <<scroll-scan>> for more.
+scrolling through large result sets.


--- a/docs/reference/search/uri-request.asciidoc
+++ b/docs/reference/search/uri-request.asciidoc
@ -103,7 +103,9 @@ Defaults to no terminate_after.
 |`size` |The number of hits to return. Defaults to `10`.

 |`search_type` |The type of the search operation to perform. Can be
-`dfs_query_then_fetch`, `query_then_fetch`, `scan` or `count`
+`dfs_query_then_fetch`, `query_then_fetch`, `scan`
+deprecated[2.1.0,Replaced by a regular `scroll` sorted by `_doc`]
+or `count`
 deprecated[2.0.0-beta1,Replaced by `size: 0`]. Defaults to `query_then_fetch`. See
 <<search-request-search-type,_Search Type_>> for
 more details on the different types of search that can be performed.
--- a/plugins/delete-by-query/src/main/java/org/elasticsearch/action/deletebyquery/TransportDeleteByQueryAction.java
+++ b/plugins/delete-by-query/src/main/java/org/elasticsearch/action/deletebyquery/TransportDeleteByQueryAction.java
@ -102,12 +102,17 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
        void executeScan() {
            try {
                final SearchRequest scanRequest = new SearchRequest(request.indices()).types(request.types()).indicesOptions(request.indicesOptions());
-                scanRequest.searchType(SearchType.SCAN).scroll(request.scroll());
+                scanRequest.scroll(request.scroll());
                if (request.routing() != null) {
                    scanRequest.routing(request.routing());
                }

-                SearchSourceBuilder source = new SearchSourceBuilder().query(request.source()).fields("_routing", "_parent").fetchSource(false).version(true);
+                SearchSourceBuilder source = new SearchSourceBuilder()
+                        .query(request.source())
+                        .fields("_routing", "_parent")
+                        .sort("_doc") // important for performance
+                        .fetchSource(false)
+                        .version(true);
                if (request.size() > 0) {
                    source.size(request.size());
                }
@ -121,17 +126,9 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
                    @Override
                    public void onResponse(SearchResponse searchResponse) {
                        long hits = searchResponse.getHits().getTotalHits();
-                        logger.trace("scan request executed: found [{}] document(s) to delete", hits);
-                        addShardFailures(searchResponse.getShardFailures());
-
-                        if (hits == 0) {
-                            finishHim(searchResponse.getScrollId(), false, null);
-                            return;
-                        }
+                        logger.trace("first request executed: found [{}] document(s) to delete", hits);
                        total.set(hits);
-
-                        logger.trace("start scrolling [{}] document(s)", hits);
-                        executeScroll(searchResponse.getScrollId());
+                        deleteHits(null, searchResponse);
                    }

                    @Override
@ -151,53 +148,7 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
                scrollAction.execute(new SearchScrollRequest(scrollId).scroll(request.scroll()), new ActionListener<SearchResponse>() {
                    @Override
                    public void onResponse(SearchResponse scrollResponse) {
-                        final SearchHit[] docs = scrollResponse.getHits().getHits();
-                        final String nextScrollId = scrollResponse.getScrollId();
-                        addShardFailures(scrollResponse.getShardFailures());
-
-                        if (logger.isTraceEnabled()) {
-                            logger.trace("scroll request [{}] executed: [{}] document(s) returned", scrollId, docs.length);
-                        }
-
-                        if ((docs.length == 0) || (nextScrollId == null)) {
-                            logger.trace("scrolling documents terminated");
-                            finishHim(scrollId, false, null);
-                            return;
-                        }
-
-                        if (hasTimedOut()) {
-                            logger.trace("scrolling documents timed out");
-                            finishHim(scrollId, true, null);
-                            return;
-                        }
-
-                        // Delete the scrolled documents using the Bulk API
-                        BulkRequest bulkRequest = new BulkRequest();
-                        for (SearchHit doc : docs) {
-                            DeleteRequest delete = new DeleteRequest(doc.index(), doc.type(), doc.id()).version(doc.version());
-                            SearchHitField routing = doc.field("_routing");
-                            if (routing != null) {
-                                delete.routing((String) routing.value());
-                            }
-                            SearchHitField parent = doc.field("_parent");
-                            if (parent != null) {
-                                delete.parent((String) parent.value());
-                            }
-                            bulkRequest.add(delete);
-                        }
-
-                        logger.trace("executing bulk request with [{}] deletions", bulkRequest.numberOfActions());
-                        client.bulk(bulkRequest, new ActionListener<BulkResponse>() {
-                            @Override
-                            public void onResponse(BulkResponse bulkResponse) {
-                                onBulkResponse(nextScrollId, bulkResponse);
-                            }
-
-                            @Override
-                            public void onFailure(Throwable e) {
-                                onBulkFailure(nextScrollId, docs, e);
-                            }
-                        });
+                        deleteHits(scrollId, scrollResponse);
                    }

                    @Override
@ -212,6 +163,56 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
            }
        }

+        /**
+         * Handles one page of hits: records the shard failures of the response, stops when no
+         * more hits are returned or the request has timed out, and otherwise sends a bulk request
+         * that deletes every returned hit (carrying over its version, routing and parent).
+         *
+         * @param scrollId       id of the scroll request that produced this response, or
+         *                       {@code null} when the response comes from the initial search
+         * @param scrollResponse the search or scroll response whose hits must be deleted
+         */
+        void deleteHits(String scrollId, SearchResponse scrollResponse) {
+            final SearchHit[] docs = scrollResponse.getHits().getHits();
+            final String nextScrollId = scrollResponse.getScrollId();
+            // The initial search is dispatched here with a null scrollId. Fall back to the id
+            // carried by the response so that finishHim still receives the scroll id on early
+            // termination, as the zero-hit path did before this method was extracted.
+            final String scrollIdToFinish = scrollId == null ? nextScrollId : scrollId;
+            addShardFailures(scrollResponse.getShardFailures());
+
+            if (logger.isTraceEnabled()) {
+                logger.trace("scroll request [{}] executed: [{}] document(s) returned", scrollId, docs.length);
+            }
+
+            if ((docs.length == 0) || (nextScrollId == null)) {
+                logger.trace("scrolling documents terminated");
+                finishHim(scrollIdToFinish, false, null);
+                return;
+            }
+
+            if (hasTimedOut()) {
+                logger.trace("scrolling documents timed out");
+                finishHim(scrollIdToFinish, true, null);
+                return;
+            }
+
+            // Delete the scrolled documents using the Bulk API
+            BulkRequest bulkRequest = new BulkRequest();
+            for (SearchHit doc : docs) {
+                // The version returned by the search is set on the delete request
+                DeleteRequest delete = new DeleteRequest(doc.index(), doc.type(), doc.id()).version(doc.version());
+                SearchHitField routing = doc.field("_routing");
+                if (routing != null) {
+                    delete.routing((String) routing.value());
+                }
+                SearchHitField parent = doc.field("_parent");
+                if (parent != null) {
+                    delete.parent((String) parent.value());
+                }
+                bulkRequest.add(delete);
+            }
+
+            logger.trace("executing bulk request with [{}] deletions", bulkRequest.numberOfActions());
+            client.bulk(bulkRequest, new ActionListener<BulkResponse>() {
+                @Override
+                public void onResponse(BulkResponse bulkResponse) {
+                    // Hand over the id of the next page
+                    onBulkResponse(nextScrollId, bulkResponse);
+                }
+
+                @Override
+                public void onFailure(Throwable e) {
+                    onBulkFailure(nextScrollId, docs, e);
+                }
+            });
+        }
+
        void onBulkResponse(String scrollId, BulkResponse bulkResponse) {
            try {
                for (BulkItemResponse item : bulkResponse.getItems()) {