Merge branch 'master' into feature/query-refactoring

commit cc69de5c5f
@@ -61,13 +61,7 @@ public class BootstrapForTesting {
         try {
             JarHell.checkJarHell();
         } catch (Exception e) {
-            if (Boolean.parseBoolean(System.getProperty("tests.maven"))) {
-                throw new RuntimeException("found jar hell in test classpath", e);
-            } else {
-                Loggers.getLogger(BootstrapForTesting.class)
-                    .warn("Your ide or custom test runner has jar hell issues, " +
-                          "you might want to look into that", e);
-            }
+            throw new RuntimeException("found jar hell in test classpath", e);
         }

         // make sure java.io.tmpdir exists always (in case code uses it in a static initializer)
@@ -60,17 +60,13 @@ SearchResponse scrollResp = client.prepareSearch(test)
         .setQuery(qb)
         .setSize(100).execute().actionGet(); //100 hits per shard will be returned for each scroll
 //Scroll until no hits are returned
-while (true) {
+do {
     for (SearchHit hit : scrollResp.getHits().getHits()) {
         //Handle the hit...
     }

     scrollResp = client.prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
-    //Break condition: No hits are returned
-    if (scrollResp.getHits().getHits().length == 0) {
-        break;
-    }
-}
+} while(scrollResp.getHits().getHits().length != 0); // Zero hits mark the end of the scroll and the while loop.
 --------------------------------------------------

 [[java-search-msearch]]
@@ -1,34 +1,83 @@
 [[analysis-compound-word-tokenfilter]]
 === Compound Word Token Filter

-Token filters that allow to decompose compound words. There are two
-types available: `dictionary_decompounder` and
-`hyphenation_decompounder`.
+The `hyphenation_decompounder` and `dictionary_decompounder` token filters can
+decompose compound words found in many German languages into word parts.

-The following are settings that can be set for a compound word token
-filter type:
+Both token filters require a dictionary of word parts, which can be provided
+as:

-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`word_list` |A list of words to use.
-
-|`word_list_path` |A path (either relative to `config` location, or
-absolute) to a list of words.
-
-|`hyphenation_patterns_path` |A path (either relative to `config` location, or
-absolute) to a FOP XML hyphenation pattern file. (See http://offo.sourceforge.net/hyphenation/)
-Required for `hyphenation_decompounder`.
-
-|`min_word_size` |Minimum word size(Integer). Defaults to 5.
-
-|`min_subword_size` |Minimum subword size(Integer). Defaults to 2.
-
-|`max_subword_size` |Maximum subword size(Integer). Defaults to 15.
-
-|`only_longest_match` |Only matching the longest(Boolean). Defaults to
-`false`
-|=======================================================================
+[horizontal]
+`word_list`::
+
+An array of words, specified inline in the token filter configuration, or
+
+`word_list_path`::
+
+The path (either absolute or relative to the `config` directory) to a UTF-8
+encoded file containing one word per line.
+
+[float]
+=== Hyphenation decompounder
+
+The `hyphenation_decompounder` uses hyphenation grammars to find potential
+subwords that are then checked against the word dictionary. The quality of the
+output tokens is directly connected to the quality of the grammar file you
+use. For languages like German they are quite good.
+
+XML based hyphenation grammar files can be found in the
+http://offo.sourceforge.net/hyphenation/#FOP+XML+Hyphenation+Patterns[Objects For Formatting Objects]
+(OFFO) Sourceforge project. You can download http://downloads.sourceforge.net/offo/offo-hyphenation.zip[offo-hyphenation.zip]
+directly and look in the `offo-hyphenation/hyph/` directory.
+Credits for the hyphenation code go to the Apache FOP project.
+
+[float]
+=== Dictionary decompounder
+
+The `dictionary_decompounder` uses a brute force approach in conjunction with
+only the word dictionary to find subwords in a compound word. It is much
+slower than the hyphenation decompounder but can be used as a first start to
+check the quality of your dictionary.
+
+[float]
+=== Compound token filter parameters
+
+The following parameters can be used to configure a compound word token
+filter:
+
+[horizontal]
+`type`::
+
+Either `dictionary_decompounder` or `hyphenation_decompounder`.
+
+`word_list`::
+
+An array containing a list of words to use for the word dictionary.
+
+`word_list_path`::
+
+The path (either absolute or relative to the `config` directory) to the word dictionary.
+
+`hyphenation_patterns_path`::
+
+The path (either absolute or relative to the `config` directory) to a FOP XML hyphenation pattern file. (required for hyphenation)
+
+`min_word_size`::
+
+Minimum word size. Defaults to 5.
+
+`min_subword_size`::
+
+Minimum subword size. Defaults to 2.
+
+`max_subword_size`::
+
+Maximum subword size. Defaults to 15.
+
+`only_longest_match`::
+
+Whether to include only the longest matching subword or not. Defaults to `false`.

 Here is an example:

@@ -48,5 +97,6 @@ index :
             myTokenFilter2 :
                 type : hyphenation_decompounder
                 word_list_path: path/to/words.txt
+                hyphenation_patterns_path: path/to/fop.xml
                 max_subword_size : 22
 --------------------------------------------------
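To see the decompounders described above in action, here is a minimal sketch; the index name, filter and analyzer names, and the tiny inline word list are made up for illustration:

[source,sh]
--------------------------------------------------
# Hypothetical index with a dictionary_decompounder filter.
curl -XPUT 'localhost:9200/decompound_example' -d '
{
  "settings": {
    "analysis": {
      "filter": {
        "german_decompounder": {
          "type": "dictionary_decompounder",
          "word_list": ["donau", "dampf", "schiff"]
        }
      },
      "analyzer": {
        "german_compound": {
          "tokenizer": "standard",
          "filter": ["lowercase", "german_decompounder"]
        }
      }
    }
  }
}'

# "Donaudampfschiff" should come back as the original token plus the
# subword tokens donau, dampf and schiff.
curl -XGET 'localhost:9200/decompound_example/_analyze?analyzer=german_compound&text=Donaudampfschiff'
--------------------------------------------------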
@@ -56,7 +56,7 @@ newlines. Example:
 $ cat requests
 { "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } }
 { "field1" : "value1" }
-$ curl -s -XPOST localhost:9200/_bulk --data-binary @requests; echo
+$ curl -s -XPOST localhost:9200/_bulk --data-binary "@requests"; echo
 {"took":7,"items":[{"create":{"_index":"test","_type":"type1","_id":"1","_version":1}}]}
 --------------------------------------------------
@@ -544,7 +544,7 @@ You can download the sample dataset (accounts.json) from https://github.com/bly2

 [source,sh]
 --------------------------------------------------
-curl -XPOST 'localhost:9200/bank/account/_bulk?pretty' --data-binary @accounts.json
+curl -XPOST 'localhost:9200/bank/account/_bulk?pretty' --data-binary "@accounts.json"
 curl 'localhost:9200/_cat/indices?v'
 --------------------------------------------------

@@ -915,7 +915,7 @@ In SQL, the above aggregation is similar in concept to:

 [source,sh]
 --------------------------------------------------
-SELECT COUNT(*) from bank GROUP BY state ORDER BY COUNT(*) DESC
+SELECT state, COUNT(*) FROM bank GROUP BY state ORDER BY COUNT(*) DESC
 --------------------------------------------------

 And the response (partially shown):
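For reference, the Elasticsearch side of this SQL comparison in the getting-started guide is a `terms` aggregation on the same `bank` index; a rough sketch (bucket ordering by document count, descending, is the default):

[source,sh]
--------------------------------------------------
# Group accounts by state and count them, largest buckets first,
# mirroring: SELECT state, COUNT(*) FROM bank GROUP BY state ORDER BY COUNT(*) DESC
curl -XPOST 'localhost:9200/bank/_search?pretty' -d '
{
  "size": 0,
  "aggs": {
    "group_by_state": {
      "terms": { "field": "state" }
    }
  }
}'
--------------------------------------------------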
@@ -70,8 +70,9 @@ Checking shards may take a lot of time on large indices.

 [[index-codec]] `index.codec`::

-experimental[] The `default` value compresses stored data with LZ4
-compression, but this can be set to `best_compression` for a higher
+experimental[] The +default+ value compresses stored data with LZ4
+compression, but this can be set to +best_compression+
+which uses https://en.wikipedia.org/wiki/DEFLATE[DEFLATE] for a higher
 compression ratio, at the expense of slower stored fields performance.

 [float]
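Since `index.codec` is a static index setting, `best_compression` is normally applied when the index is created (or while it is closed) rather than on a live open index; a minimal sketch with a hypothetical index name:

[source,sh]
--------------------------------------------------
# Create an index whose stored fields are compressed with DEFLATE
# instead of the default LZ4.
curl -XPUT 'localhost:9200/archived_logs' -d '
{
  "settings": {
    "index.codec": "best_compression"
  }
}'
--------------------------------------------------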
@@ -8,7 +8,7 @@ _filter context_:

 Query context::

-A query used in query context will caculated relevance scores and will not be
+A query used in query context will calculate relevance scores and will not be
 cacheable. Query context is used whenever filter context does not apply.

 Filter context::
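The distinction is easiest to see in a `bool` query, where `must` clauses run in query context and `filter` clauses run in filter context; a sketch with made-up field names:

[source,sh]
--------------------------------------------------
# The "must" clause is scored (query context); the "filter" clause is
# not scored and can be cached (filter context).
curl -XGET 'localhost:9200/_search' -d '
{
  "query": {
    "bool": {
      "must":   { "match": { "title": "search" } },
      "filter": { "term":  { "status": "published" } }
    }
  }
}'
--------------------------------------------------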
@@ -35,7 +35,7 @@ $ cat requests
 {"search_type" : "dfs_query_then_fetch"}
 {"query" : {"match_all" : {}}}

-$ curl -XGET localhost:9200/_msearch --data-binary @requests; echo
+$ curl -XGET localhost:9200/_msearch --data-binary "@requests"; echo
 --------------------------------------------------

 Note, the above includes an example of an empty header (can also be just
@@ -366,7 +366,7 @@ Request:

 [source,js]
 --------------------------------------------------
-curl -XGET 'localhost:9200/twitter/tweet/_mpercolate' --data-binary @requests.txt; echo
+curl -XGET 'localhost:9200/twitter/tweet/_mpercolate' --data-binary "@requests.txt"; echo
 --------------------------------------------------

 The index `twitter` is the default index, and the type `tweet` is the default type and will be used in the case a header
@@ -1 +0,0 @@
-7ff51040bbcc9085dcb9a24a2c2a3cc7ac995988

@@ -0,0 +1 @@
+b53f650323b7242dcced25b679f3e9aa4b494da5

@@ -1 +0,0 @@
-b0712cc659e72b9da0f5b03872d2476ab4a695f7

@@ -0,0 +1 @@
+50ba7eb31719be1260bdae51cf69340df2d91ec4
@@ -16,7 +16,6 @@

     <properties>
         <elasticsearch.plugin.classname>org.elasticsearch.plugin.discovery.ec2.Ec2DiscoveryPlugin</elasticsearch.plugin.classname>
-        <amazonaws.version>1.10.12</amazonaws.version>
         <tests.jvms>1</tests.jvms>
         <tests.rest.suite>discovery_ec2</tests.rest.suite>
         <tests.rest.load_packaged>false</tests.rest.load_packaged>

@@ -26,6 +26,7 @@
         <elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
         <elasticsearch.plugin.isolated>true</elasticsearch.plugin.isolated>
         <elasticsearch.plugin.site>false</elasticsearch.plugin.site>
+        <amazonaws.version>1.10.19</amazonaws.version>
     </properties>

     <dependencies>
@@ -1 +0,0 @@
-7ff51040bbcc9085dcb9a24a2c2a3cc7ac995988

@@ -0,0 +1 @@
+b53f650323b7242dcced25b679f3e9aa4b494da5

@@ -1 +0,0 @@
-31afbe46b65e9933316c7e8dfb8b88dc4b37b6ba

@@ -0,0 +1 @@
+c8764f3e61a3c420db429870ec22b31fe755d81d

@@ -1 +0,0 @@
-c9e2593fdf398c5f8906a704db037d17b2de4b2a

@@ -0,0 +1 @@
+a23dc60d56d54126250c23cab1d01328b1e83678
@@ -16,7 +16,6 @@

     <properties>
         <elasticsearch.plugin.classname>org.elasticsearch.plugin.repository.s3.S3RepositoryPlugin</elasticsearch.plugin.classname>
-        <amazonaws.version>1.10.12</amazonaws.version>
         <tests.jvms>1</tests.jvms>
         <tests.rest.suite>repository_s3</tests.rest.suite>
         <tests.rest.load_packaged>false</tests.rest.load_packaged>