Merge pull request #12994 from jpountz/deprecate/scan

Deprecate the `scan` search type.
This commit is contained in:
Adrien Grand 2015-08-20 12:47:45 +02:00
commit 41d8fbe8f5
13 changed files with 121 additions and 121 deletions

View File

@ -54,7 +54,9 @@ public enum SearchType {
/**
* Performs scanning of the results which executes the search without any sorting.
* It will automatically start scrolling the result set.
* @deprecated will be removed in 3.0, you should do a regular scroll instead, ordered by `_doc`
*/
@Deprecated
SCAN((byte) 4),
/**
* Only counts the results, will still execute aggregations and the like.
@ -69,6 +71,7 @@ public enum SearchType {
public static final SearchType DEFAULT = QUERY_THEN_FETCH;
private static final ParseField COUNT_VALUE = new ParseField("count").withAllDeprecated("query_then_fetch");
private static final ParseField SCAN_VALUE = new ParseField("scan").withAllDeprecated("query_then_fetch sorting on `_doc`");
private byte id;
@ -121,7 +124,7 @@ public enum SearchType {
return SearchType.QUERY_THEN_FETCH;
} else if ("query_and_fetch".equals(searchType)) {
return SearchType.QUERY_AND_FETCH;
} else if ("scan".equals(searchType)) {
} else if (parseFieldMatcher.match(searchType, SCAN_VALUE)) {
return SearchType.SCAN;
} else if (parseFieldMatcher.match(searchType, COUNT_VALUE)) {
return SearchType.COUNT;

View File

@ -40,6 +40,7 @@ import org.elasticsearch.threadpool.ThreadPool;
import static org.elasticsearch.action.search.type.TransportSearchHelper.buildScrollId;
@Deprecated // remove in 3.0
public class TransportSearchScanAction extends TransportSearchTypeAction {
@Inject

View File

@ -263,6 +263,7 @@ public class SearchService extends AbstractLifecycleComponent<SearchService> {
}
}
@Deprecated // remove in 3.0
public QuerySearchResult executeScan(ShardSearchRequest request) {
final SearchContext context = createAndPutContext(request);
final int originalSize = context.size();

View File

@ -418,6 +418,7 @@ public class SearchServiceTransportAction extends AbstractComponent {
}
}
@Deprecated // remove in 3.0
class SearchScanTransportHandler implements TransportRequestHandler<ShardSearchTransportRequest> {
@Override
public void messageReceived(ShardSearchTransportRequest request, TransportChannel channel) throws Exception {

View File

@ -6,7 +6,7 @@ The delete-by-query plugin adds support for deleting all of the documents
replacement for the problematic _delete-by-query_ functionality which has been
removed from Elasticsearch core.
Internally, it uses the {ref}/search-request-scroll.html#scroll-scan[Scan/Scroll]
Internally, it uses {ref}/search-request-scroll.html[Scroll]
and {ref}/docs-bulk.html[Bulk] APIs to delete documents in an efficient and
safe manner. It is slower than the old _delete-by-query_ functionality, but
fixes the problems with the previous implementation.
@ -101,7 +101,7 @@ See {ref}/search-uri-request.html[URI search request] for details.
`size`::
The number of hits returned *per shard* by the {ref}/search-request-scroll.html#scroll-scan[scan]
The number of hits returned by the {ref}/search-request-scroll.html[scroll]
request. Defaults to 10. May also be specified in the request body.
`timeout`::
@ -148,7 +148,7 @@ The JSON response looks like this:
--------------------------------------------------
Internally, the query is used to execute an initial
{ref}/search-request-scroll.html#scroll-scan[scroll/scan] request. As hits are
{ref}/search-request-scroll.html[scroll] request. As hits are
pulled from the scroll API, they are passed to the {ref}/docs-bulk.html[Bulk
API] for deletion.
@ -157,7 +157,7 @@ was visible to search at the time the request was executed. Any documents
that have been reindexed or updated during execution will not be deleted.
Since documents can be updated or deleted by external operations during the
_scan-scroll-bulk_ process, the plugin keeps track of different counters for
_scroll-bulk_ process, the plugin keeps track of different counters for
each index, with the totals displayed under the `_all` index. The counters
are as follows:
@ -212,7 +212,7 @@ Resiliency::
=== New delete-by-query implementation
The new implementation, provided by this plugin, is built internally
using {ref}/search-request-scroll.html#scroll-scan[scan and scroll] to return
using {ref}/search-request-scroll.html[scroll] to return
the document IDs and versions of all the documents that need to be deleted.
It then uses the {ref}/docs-bulk.html[`bulk` API] to do the actual deletion.
@ -231,8 +231,8 @@ try-once::
syntactic sugar::
A delete-by-query is equivalent to a scan/scroll search and corresponding
bulk-deletes by ID.
A delete-by-query is equivalent to a scroll search ordered by `_doc` and
corresponding bulk-deletes by ID.
point-in-time::
@ -267,4 +267,4 @@ move the functionality to a plugin instead of replacing the feature in core:
* There is currently no way to monitor or cancel a running delete-by-query
request, except for the `timeout` parameter.
We have plans to solve both of these issues in a later version of Elasticsearch.
We have plans to solve both of these issues in a later version of Elasticsearch.

View File

@ -16,6 +16,7 @@ As a general rule:
See <<setup-upgrade>> for more info.
--
include::migrate_2_1.asciidoc[]
include::migrate_2_0.asciidoc[]

View File

@ -27,8 +27,8 @@ The old delete-by-query functionality was fast but unsafe. It could lead to
document differences between the primary and replica shards, and could even
produce out of memory exceptions and cause the cluster to crash.
This feature has been reimplemented using the <<scroll-scan,scroll/scan>> and
the <<docs-bulk,`bulk`>> API, which may be slower for queries which match
This feature has been reimplemented using the <<search-request-scroll,scroll>> and
<<docs-bulk,`bulk`>> APIs, which may be slower for queries which match
large numbers of documents, but is safe.
Currently, a long running delete-by-query job cannot be cancelled, which is

View File

@ -0,0 +1,27 @@
[[breaking-changes-2.1]]
== Breaking changes in 2.1
This section discusses the changes that you need to be aware of when migrating
your application to Elasticsearch 2.1.
=== Search changes
==== `search_type=scan` deprecated
The `scan` search type has been deprecated. All benefits from this search
type can now be achieved by doing a scroll request that sorts documents in
`_doc` order, for instance:
[source,sh]
---------------
GET /my_index/_search?scroll=2m
{
"sort": [
"_doc"
]
}
---------------
Scroll requests sorted by `_doc` have been optimized to more efficiently resume
from where the previous request stopped, so this will have the same performance
characteristics as the former `scan` search type.

View File

@ -64,7 +64,7 @@ And here is a sample response:
`search_type`::
The type of the search operation to perform. Can be
`dfs_query_then_fetch`, `query_then_fetch`, or 'scan'.
`dfs_query_then_fetch` or `query_then_fetch`.
Defaults to `query_then_fetch`.
See <<search-request-search-type,_Search Type_>> for more.

View File

@ -90,59 +90,20 @@ used.
NOTE: If the request specifies aggregations, only the initial search response
will contain the aggregations results.
[[scroll-scan]]
==== Efficient scrolling with Scroll-Scan
Deep pagination with <<search-request-from-size,`from` and `size`>> -- e.g.
`?size=10&from=10000` -- is very inefficient as (in this example) 100,000
sorted results have to be retrieved from each shard and resorted in order to
return just 10 results. This process has to be repeated for every page
requested.
The `scroll` API keeps track of which results have already been returned and
so is able to return sorted results more efficiently than with deep
pagination. However, sorting results (which happens by default) still has a
cost.
Normally, you just want to retrieve all results and the order doesn't matter.
Scrolling can be combined with the <<scan,`scan`>> search type to disable
any scoring or sorting and to return results in the most efficient way
possible. All that is needed is to add `search_type=scan` to the query string
of the initial search request:
NOTE: Scroll requests have optimizations that make them faster when the sort
order is `_doc`. If you want to iterate over all documents regardless of the
order, this is the most efficient option:
[source,js]
--------------------------------------------------
curl 'localhost:9200/twitter/tweet/_search?scroll=1m&search_type=scan' <1> -d '
curl -XGET 'localhost:9200/_search?scroll=1m' -d '
{
"query": {
"match" : {
"title" : "elasticsearch"
}
}
"sort": [
"_doc"
]
}
'
--------------------------------------------------
<1> Setting `search_type` to `scan` disables sorting and makes scrolling
very efficient.
A scanning scroll request differs from a standard scroll request in four
ways:
* No score is calculated and sorting is disabled. Results are returned in
the order they appear in the index.
* Aggregations are not supported.
* The response of the initial `search` request will not contain any results in
the `hits` array. The first results will be returned by the first `scroll`
request.
* The <<search-request-from-size,`size` parameter>> controls the number of
results *per shard*, not per request, so a `size` of `10` which hits 5
shards will return a maximum of 50 results per `scroll` request.
If you want the scoring to happen, even without sorting on it, set the
`track_scores` parameter to `true`.
[[scroll-search-context]]
==== Keeping the search context alive

View File

@ -26,8 +26,8 @@ each shard using these global frequencies.
Also, because of the need to sort the results, getting back a large
document set, or even scrolling it, while maintaining the correct sorting
behavior can be a very expensive operation. For large result set
scrolling without sorting, the `scan` search type (explained below) is
also available.
scrolling, it is best to sort by `_doc` if the order in which documents
are returned is not important.
Elasticsearch is very flexible and allows you to control the type of search
to execute on a *per search request* basis. The type can be configured
@ -77,9 +77,11 @@ API as it provides more options.
[[scan]]
==== Scan
deprecated[2.1.0, `scan` does not provide any benefits over a regular `scroll` request sorted by `_doc`]
Parameter value: *scan*.
The `scan` search type disables sorting in order to allow very efficient
scrolling through large result sets. See <<scroll-scan>> for more.
scrolling through large result sets.

View File

@ -103,7 +103,9 @@ Defaults to no terminate_after.
|`size` |The number of hits to return. Defaults to `10`.
|`search_type` |The type of the search operation to perform. Can be
`dfs_query_then_fetch`, `query_then_fetch`, `scan` or `count`
`dfs_query_then_fetch`, `query_then_fetch`, `scan`
deprecated[2.1.0,Replaced by a regular `scroll` sorted by `_doc`]
or `count`
deprecated[2.0.0-beta1,Replaced by `size: 0`]. Defaults to `query_then_fetch`. See
<<search-request-search-type,_Search Type_>> for
more details on the different types of search that can be performed.

View File

@ -102,12 +102,17 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
void executeScan() {
try {
final SearchRequest scanRequest = new SearchRequest(request.indices()).types(request.types()).indicesOptions(request.indicesOptions());
scanRequest.searchType(SearchType.SCAN).scroll(request.scroll());
scanRequest.scroll(request.scroll());
if (request.routing() != null) {
scanRequest.routing(request.routing());
}
SearchSourceBuilder source = new SearchSourceBuilder().query(request.source()).fields("_routing", "_parent").fetchSource(false).version(true);
SearchSourceBuilder source = new SearchSourceBuilder()
.query(request.source())
.fields("_routing", "_parent")
.sort("_doc") // important for performance
.fetchSource(false)
.version(true);
if (request.size() > 0) {
source.size(request.size());
}
@ -121,17 +126,9 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
@Override
public void onResponse(SearchResponse searchResponse) {
long hits = searchResponse.getHits().getTotalHits();
logger.trace("scan request executed: found [{}] document(s) to delete", hits);
addShardFailures(searchResponse.getShardFailures());
if (hits == 0) {
finishHim(searchResponse.getScrollId(), false, null);
return;
}
logger.trace("first request executed: found [{}] document(s) to delete", hits);
total.set(hits);
logger.trace("start scrolling [{}] document(s)", hits);
executeScroll(searchResponse.getScrollId());
deleteHits(null, searchResponse);
}
@Override
@ -151,53 +148,7 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
scrollAction.execute(new SearchScrollRequest(scrollId).scroll(request.scroll()), new ActionListener<SearchResponse>() {
@Override
public void onResponse(SearchResponse scrollResponse) {
final SearchHit[] docs = scrollResponse.getHits().getHits();
final String nextScrollId = scrollResponse.getScrollId();
addShardFailures(scrollResponse.getShardFailures());
if (logger.isTraceEnabled()) {
logger.trace("scroll request [{}] executed: [{}] document(s) returned", scrollId, docs.length);
}
if ((docs.length == 0) || (nextScrollId == null)) {
logger.trace("scrolling documents terminated");
finishHim(scrollId, false, null);
return;
}
if (hasTimedOut()) {
logger.trace("scrolling documents timed out");
finishHim(scrollId, true, null);
return;
}
// Delete the scrolled documents using the Bulk API
BulkRequest bulkRequest = new BulkRequest();
for (SearchHit doc : docs) {
DeleteRequest delete = new DeleteRequest(doc.index(), doc.type(), doc.id()).version(doc.version());
SearchHitField routing = doc.field("_routing");
if (routing != null) {
delete.routing((String) routing.value());
}
SearchHitField parent = doc.field("_parent");
if (parent != null) {
delete.parent((String) parent.value());
}
bulkRequest.add(delete);
}
logger.trace("executing bulk request with [{}] deletions", bulkRequest.numberOfActions());
client.bulk(bulkRequest, new ActionListener<BulkResponse>() {
@Override
public void onResponse(BulkResponse bulkResponse) {
onBulkResponse(nextScrollId, bulkResponse);
}
@Override
public void onFailure(Throwable e) {
onBulkFailure(nextScrollId, docs, e);
}
});
deleteHits(scrollId, scrollResponse);
}
@Override
@ -212,6 +163,56 @@ public class TransportDeleteByQueryAction extends HandledTransportAction<DeleteB
}
}
/**
 * Deletes the hits carried by a search or scroll response with a single bulk request.
 * <p>
 * Shard failures of the response are recorded first. The method then finishes early when the
 * response has no hits, when it carries no scroll id, or when the request has timed out.
 * Otherwise one versioned delete is built per hit (propagating {@code _routing} and
 * {@code _parent} when the hit has them) and the bulk outcome is handed to
 * {@code onBulkResponse} / {@code onBulkFailure} together with the next scroll id.
 *
 * @param scrollId       the scroll id used to obtain {@code scrollResponse}; {@code null} when the
 *                       response comes from the initial search request
 * @param scrollResponse the response whose hits must be deleted
 */
void deleteHits(String scrollId, SearchResponse scrollResponse) {
    final SearchHit[] docs = scrollResponse.getHits().getHits();
    final String nextScrollId = scrollResponse.getScrollId();
    addShardFailures(scrollResponse.getShardFailures());

    if (logger.isTraceEnabled()) {
        logger.trace("scroll request [{}] executed: [{}] document(s) returned", scrollId, docs.length);
    }

    // The initial search request has no previous scroll id (scrollId is null), but its response
    // may still carry one. Fall back to it so the early exits below do not hand a null id to
    // finishHim and lose track of the scroll that was just opened.
    // NOTE(review): assumes finishHim releases the scroll identified by its first argument — confirm.
    final String scrollIdToFinish = (scrollId == null) ? nextScrollId : scrollId;

    if ((docs.length == 0) || (nextScrollId == null)) {
        logger.trace("scrolling documents terminated");
        finishHim(scrollIdToFinish, false, null);
        return;
    }

    if (hasTimedOut()) {
        logger.trace("scrolling documents timed out");
        finishHim(scrollIdToFinish, true, null);
        return;
    }

    // Delete the scrolled documents using the Bulk API
    BulkRequest bulkRequest = new BulkRequest();
    for (SearchHit doc : docs) {
        // The version seen by the search is set on the delete request.
        DeleteRequest delete = new DeleteRequest(doc.index(), doc.type(), doc.id()).version(doc.version());
        SearchHitField routing = doc.field("_routing");
        if (routing != null) {
            delete.routing((String) routing.value());
        }
        SearchHitField parent = doc.field("_parent");
        if (parent != null) {
            delete.parent((String) parent.value());
        }
        bulkRequest.add(delete);
    }

    logger.trace("executing bulk request with [{}] deletions", bulkRequest.numberOfActions());
    client.bulk(bulkRequest, new ActionListener<BulkResponse>() {
        @Override
        public void onResponse(BulkResponse bulkResponse) {
            onBulkResponse(nextScrollId, bulkResponse);
        }

        @Override
        public void onFailure(Throwable e) {
            onBulkFailure(nextScrollId, docs, e);
        }
    });
}
void onBulkResponse(String scrollId, BulkResponse bulkResponse) {
try {
for (BulkItemResponse item : bulkResponse.getItems()) {