Merge pull request #12994 from jpountz/deprecate/scan

Deprecate the `scan` search type.
This commit is contained in:
Adrien Grand 2015-08-20 12:47:45 +02:00
commit 41d8fbe8f5
13 changed files with 121 additions and 121 deletions

View File

@ -54,7 +54,9 @@ public enum SearchType {
/** /**
* Performs scanning of the results which executes the search without any sorting. * Performs scanning of the results which executes the search without any sorting.
* It will automatically start scrolling the result set. * It will automatically start scrolling the result set.
* @deprecated will be removed in 3.0; use a regular scroll request sorted by `_doc` instead
*/ */
@Deprecated
SCAN((byte) 4), SCAN((byte) 4),
/** /**
* Only counts the results, will still execute aggregations and the like. * Only counts the results, will still execute aggregations and the like.
@ -69,6 +71,7 @@ public enum SearchType {
public static final SearchType DEFAULT = QUERY_THEN_FETCH; public static final SearchType DEFAULT = QUERY_THEN_FETCH;
private static final ParseField COUNT_VALUE = new ParseField("count").withAllDeprecated("query_then_fetch"); private static final ParseField COUNT_VALUE = new ParseField("count").withAllDeprecated("query_then_fetch");
private static final ParseField SCAN_VALUE = new ParseField("scan").withAllDeprecated("query_then_fetch sorting on `_doc`");
private byte id; private byte id;
@ -121,7 +124,7 @@ public enum SearchType {
return SearchType.QUERY_THEN_FETCH; return SearchType.QUERY_THEN_FETCH;
} else if ("query_and_fetch".equals(searchType)) { } else if ("query_and_fetch".equals(searchType)) {
return SearchType.QUERY_AND_FETCH; return SearchType.QUERY_AND_FETCH;
} else if ("scan".equals(searchType)) { } else if (parseFieldMatcher.match(searchType, SCAN_VALUE)) {
return SearchType.SCAN; return SearchType.SCAN;
} else if (parseFieldMatcher.match(searchType, COUNT_VALUE)) { } else if (parseFieldMatcher.match(searchType, COUNT_VALUE)) {
return SearchType.COUNT; return SearchType.COUNT;

View File

@ -40,6 +40,7 @@ import org.elasticsearch.threadpool.ThreadPool;
import static org.elasticsearch.action.search.type.TransportSearchHelper.buildScrollId; import static org.elasticsearch.action.search.type.TransportSearchHelper.buildScrollId;
@Deprecated // remove in 3.0
public class TransportSearchScanAction extends TransportSearchTypeAction { public class TransportSearchScanAction extends TransportSearchTypeAction {
@Inject @Inject

View File

@ -263,6 +263,7 @@ public class SearchService extends AbstractLifecycleComponent<SearchService> {
} }
} }
@Deprecated // remove in 3.0
public QuerySearchResult executeScan(ShardSearchRequest request) { public QuerySearchResult executeScan(ShardSearchRequest request) {
final SearchContext context = createAndPutContext(request); final SearchContext context = createAndPutContext(request);
final int originalSize = context.size(); final int originalSize = context.size();

View File

@ -418,6 +418,7 @@ public class SearchServiceTransportAction extends AbstractComponent {
} }
} }
@Deprecated // remove in 3.0
class SearchScanTransportHandler implements TransportRequestHandler<ShardSearchTransportRequest> { class SearchScanTransportHandler implements TransportRequestHandler<ShardSearchTransportRequest> {
@Override @Override
public void messageReceived(ShardSearchTransportRequest request, TransportChannel channel) throws Exception { public void messageReceived(ShardSearchTransportRequest request, TransportChannel channel) throws Exception {

View File

@ -6,7 +6,7 @@ The delete-by-query plugin adds support for deleting all of the documents
replacement for the problematic _delete-by-query_ functionality which has been replacement for the problematic _delete-by-query_ functionality which has been
removed from Elasticsearch core. removed from Elasticsearch core.
Internally, it uses the {ref}/search-request-scroll.html#scroll-scan[Scan/Scroll] Internally, it uses {ref}/search-request-scroll.html[Scroll]
and {ref}/docs-bulk.html[Bulk] APIs to delete documents in an efficient and and {ref}/docs-bulk.html[Bulk] APIs to delete documents in an efficient and
safe manner. It is slower than the old _delete-by-query_ functionality, but safe manner. It is slower than the old _delete-by-query_ functionality, but
fixes the problems with the previous implementation. fixes the problems with the previous implementation.
@ -101,7 +101,7 @@ See {ref}/search-uri-request.html[URI search request] for details.
`size`:: `size`::
The number of hits returned *per shard* by the {ref}/search-request-scroll.html#scroll-scan[scan] The number of hits returned by the {ref}/search-request-scroll.html[scroll]
request. Defaults to 10. May also be specified in the request body. request. Defaults to 10. May also be specified in the request body.
`timeout`:: `timeout`::
@ -148,7 +148,7 @@ The JSON response looks like this:
-------------------------------------------------- --------------------------------------------------
Internally, the query is used to execute an initial Internally, the query is used to execute an initial
{ref}/search-request-scroll.html#scroll-scan[scroll/scan] request. As hits are {ref}/search-request-scroll.html[scroll] request. As hits are
pulled from the scroll API, they are passed to the {ref}/docs-bulk.html[Bulk pulled from the scroll API, they are passed to the {ref}/docs-bulk.html[Bulk
API] for deletion. API] for deletion.
@ -157,7 +157,7 @@ was visible to search at the time the request was executed. Any documents
that have been reindexed or updated during execution will not be deleted. that have been reindexed or updated during execution will not be deleted.
Since documents can be updated or deleted by external operations during the Since documents can be updated or deleted by external operations during the
_scan-scroll-bulk_ process, the plugin keeps track of different counters for _scroll-bulk_ process, the plugin keeps track of different counters for
each index, with the totals displayed under the `_all` index. The counters each index, with the totals displayed under the `_all` index. The counters
are as follows: are as follows:
@ -212,7 +212,7 @@ Resiliency::
=== New delete-by-query implementation === New delete-by-query implementation
The new implementation, provided by this plugin, is built internally The new implementation, provided by this plugin, is built internally
using {ref}/search-request-scroll.html#scroll-scan[scan and scroll] to return using {ref}/search-request-scroll.html[scroll] to return
the document IDs and versions of all the documents that need to be deleted. the document IDs and versions of all the documents that need to be deleted.
It then uses the {ref}/docs-bulk.html[`bulk` API] to do the actual deletion. It then uses the {ref}/docs-bulk.html[`bulk` API] to do the actual deletion.
@ -231,8 +231,8 @@ try-once::
syntactic sugar:: syntactic sugar::
A delete-by-query is equivalent to a scan/scroll search and corresponding A delete-by-query is equivalent to a scroll search ordered by `_doc` and
bulk-deletes by ID. corresponding bulk-deletes by ID.
point-in-time:: point-in-time::

View File

@ -16,6 +16,7 @@ As a general rule:
See <<setup-upgrade>> for more info. See <<setup-upgrade>> for more info.
-- --
include::migrate_2_1.asciidoc[]
include::migrate_2_0.asciidoc[] include::migrate_2_0.asciidoc[]

View File

@ -27,8 +27,8 @@ The old delete-by-query functionality was fast but unsafe. It could lead to
document differences between the primary and replica shards, and could even document differences between the primary and replica shards, and could even
produce out of memory exceptions and cause the cluster to crash. produce out of memory exceptions and cause the cluster to crash.
This feature has been reimplemented using the <<scroll-scan,scroll/scan>> and This feature has been reimplemented using the <<search-request-scroll,scroll>> and
the <<docs-bulk,`bulk`>> API, which may be slower for queries which match <<docs-bulk,`bulk`>> APIs, which may be slower for queries which match
large numbers of documents, but is safe. large numbers of documents, but is safe.
Currently, a long running delete-by-query job cannot be cancelled, which is Currently, a long running delete-by-query job cannot be cancelled, which is

View File

@ -0,0 +1,27 @@
[[breaking-changes-2.1]]
== Breaking changes in 2.1
This section discusses the changes that you need to be aware of when migrating
your application to Elasticsearch 2.1.
=== Search changes
==== `search_type=scan` deprecated
The `scan` search type has been deprecated. Everything it provided can now be
achieved with a regular scroll request that sorts documents in `_doc` order,
for instance:
[source,sh]
---------------
GET /my_index/_search?scroll=2m
{
"sort": [
"_doc"
]
}
---------------
Scroll requests sorted by `_doc` have been optimized to more efficiently resume
from where the previous request stopped, so this will have the same performance
characteristics as the former `scan` search type.

View File

@ -64,7 +64,7 @@ And here is a sample response:
`search_type`:: `search_type`::
The type of the search operation to perform. Can be The type of the search operation to perform. Can be
`dfs_query_then_fetch`, `query_then_fetch`, or 'scan'. `dfs_query_then_fetch` or `query_then_fetch`.
Defaults to `query_then_fetch`. Defaults to `query_then_fetch`.
See <<search-request-search-type,_Search Type_>> for more. See <<search-request-search-type,_Search Type_>> for more.

View File

@ -90,59 +90,20 @@ used.
NOTE: If the request specifies aggregations, only the initial search response NOTE: If the request specifies aggregations, only the initial search response
will contain the aggregations results. will contain the aggregations results.
[[scroll-scan]] NOTE: Scroll requests have optimizations that make them faster when the sort
==== Efficient scrolling with Scroll-Scan order is `_doc`. If you want to iterate over all documents regardless of the
order, this is the most efficient option:
Deep pagination with <<search-request-from-size,`from` and `size`>> -- e.g.
`?size=10&from=10000` -- is very inefficient as (in this example) 100,000
sorted results have to be retrieved from each shard and resorted in order to
return just 10 results. This process has to be repeated for every page
requested.
The `scroll` API keeps track of which results have already been returned and
so is able to return sorted results more efficiently than with deep
pagination. However, sorting results (which happens by default) still has a
cost.
Normally, you just want to retrieve all results and the order doesn't matter.
Scrolling can be combined with the <<scan,`scan`>> search type to disable
any scoring or sorting and to return results in the most efficient way
possible. All that is needed is to add `search_type=scan` to the query string
of the initial search request:
[source,js] [source,js]
-------------------------------------------------- --------------------------------------------------
curl 'localhost:9200/twitter/tweet/_search?scroll=1m&search_type=scan' <1> -d ' curl -XGET 'localhost:9200/_search?scroll=1m' -d '
{ {
"query": { "sort": [
"match" : { "_doc"
"title" : "elasticsearch"
}
} }
} }
' '
-------------------------------------------------- --------------------------------------------------
<1> Setting `search_type` to `scan` disables sorting and makes scrolling
very efficient.
A scanning scroll request differs from a standard scroll request in four
ways:
* No score is calculated and sorting is disabled. Results are returned in
the order they appear in the index.
* Aggregations are not supported.
* The response of the initial `search` request will not contain any results in
the `hits` array. The first results will be returned by the first `scroll`
request.
* The <<search-request-from-size,`size` parameter>> controls the number of
results *per shard*, not per request, so a `size` of `10` which hits 5
shards will return a maximum of 50 results per `scroll` request.
If you want the scoring to happen, even without sorting on it, set the
`track_scores` parameter to `true`.
[[scroll-search-context]] [[scroll-search-context]]
==== Keeping the search context alive ==== Keeping the search context alive

View File

@ -26,8 +26,8 @@ each shard using these global frequencies.
Also, because of the need to sort the results, getting back a large Also, because of the need to sort the results, getting back a large
document set, or even scrolling it, while maintaining the correct sorting document set, or even scrolling it, while maintaining the correct sorting
behavior can be a very expensive operation. For large result set behavior can be a very expensive operation. For large result set
scrolling without sorting, the `scan` search type (explained below) is scrolling, it is best to sort by `_doc` if the order in which documents
also available. are returned is not important.
Elasticsearch is very flexible and allows to control the type of search Elasticsearch is very flexible and allows to control the type of search
to execute on a *per search request* basis. The type can be configured to execute on a *per search request* basis. The type can be configured
@ -77,9 +77,11 @@ API as it provides more options.
[[scan]] [[scan]]
==== Scan ==== Scan
deprecated[2.1.0, `scan` does not provide any benefits over a regular `scroll` request sorted by `_doc`]
Parameter value: *scan*. Parameter value: *scan*.
The `scan` search type disables sorting in order to allow very efficient The `scan` search type disables sorting in order to allow very efficient
scrolling through large result sets. See <<scroll-scan>> for more. scrolling through large result sets.

View File

@ -103,7 +103,9 @@ Defaults to no terminate_after.
|`size` |The number of hits to return. Defaults to `10`. |`size` |The number of hits to return. Defaults to `10`.
|`search_type` |The type of the search operation to perform. Can be |`search_type` |The type of the search operation to perform. Can be
`dfs_query_then_fetch`, `query_then_fetch`, `scan` or `count` `dfs_query_then_fetch`, `query_then_fetch`, `scan`
deprecated[2.1.0,Replaced by a regular `scroll` sorted by `_doc`]
or `count`
deprecated[2.0.0-beta1,Replaced by `size: 0`]. Defaults to `query_then_fetch`. See deprecated[2.0.0-beta1,Replaced by `size: 0`]. Defaults to `query_then_fetch`. See
<<search-request-search-type,_Search Type_>> for <<search-request-search-type,_Search Type_>> for
more details on the different types of search that can be performed. more details on the different types of search that can be performed.

View File

@ -102,12 +102,17 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
void executeScan() { void executeScan() {
try { try {
final SearchRequest scanRequest = new SearchRequest(request.indices()).types(request.types()).indicesOptions(request.indicesOptions()); final SearchRequest scanRequest = new SearchRequest(request.indices()).types(request.types()).indicesOptions(request.indicesOptions());
scanRequest.searchType(SearchType.SCAN).scroll(request.scroll()); scanRequest.scroll(request.scroll());
if (request.routing() != null) { if (request.routing() != null) {
scanRequest.routing(request.routing()); scanRequest.routing(request.routing());
} }
SearchSourceBuilder source = new SearchSourceBuilder().query(request.source()).fields("_routing", "_parent").fetchSource(false).version(true); SearchSourceBuilder source = new SearchSourceBuilder()
.query(request.source())
.fields("_routing", "_parent")
.sort("_doc") // important for performance
.fetchSource(false)
.version(true);
if (request.size() > 0) { if (request.size() > 0) {
source.size(request.size()); source.size(request.size());
} }
@ -121,17 +126,9 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
@Override @Override
public void onResponse(SearchResponse searchResponse) { public void onResponse(SearchResponse searchResponse) {
long hits = searchResponse.getHits().getTotalHits(); long hits = searchResponse.getHits().getTotalHits();
logger.trace("scan request executed: found [{}] document(s) to delete", hits); logger.trace("first request executed: found [{}] document(s) to delete", hits);
addShardFailures(searchResponse.getShardFailures());
if (hits == 0) {
finishHim(searchResponse.getScrollId(), false, null);
return;
}
total.set(hits); total.set(hits);
deleteHits(null, searchResponse);
logger.trace("start scrolling [{}] document(s)", hits);
executeScroll(searchResponse.getScrollId());
} }
@Override @Override
@ -151,6 +148,22 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
scrollAction.execute(new SearchScrollRequest(scrollId).scroll(request.scroll()), new ActionListener<SearchResponse>() { scrollAction.execute(new SearchScrollRequest(scrollId).scroll(request.scroll()), new ActionListener<SearchResponse>() {
@Override @Override
public void onResponse(SearchResponse scrollResponse) { public void onResponse(SearchResponse scrollResponse) {
deleteHits(scrollId, scrollResponse);
}
@Override
public void onFailure(Throwable e) {
logger.error("scroll request [{}] failed, scrolling document(s) is stopped", e, scrollId);
finishHim(scrollId, hasTimedOut(), e);
}
});
} catch (Throwable t) {
logger.error("unable to execute scroll request [{}]", t, scrollId);
finishHim(scrollId, false, t);
}
}
void deleteHits(String scrollId, SearchResponse scrollResponse) {
final SearchHit[] docs = scrollResponse.getHits().getHits(); final SearchHit[] docs = scrollResponse.getHits().getHits();
final String nextScrollId = scrollResponse.getScrollId(); final String nextScrollId = scrollResponse.getScrollId();
addShardFailures(scrollResponse.getShardFailures()); addShardFailures(scrollResponse.getShardFailures());
@ -200,18 +213,6 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
}); });
} }
@Override
public void onFailure(Throwable e) {
logger.error("scroll request [{}] failed, scrolling document(s) is stopped", e, scrollId);
finishHim(scrollId, hasTimedOut(), e);
}
});
} catch (Throwable t) {
logger.error("unable to execute scroll request [{}]", t, scrollId);
finishHim(scrollId, false, t);
}
}
void onBulkResponse(String scrollId, BulkResponse bulkResponse) { void onBulkResponse(String scrollId, BulkResponse bulkResponse) {
try { try {
for (BulkItemResponse item : bulkResponse.getItems()) { for (BulkItemResponse item : bulkResponse.getItems()) {