Merge branch 'master' into feature/query-refactoring

Christoph Büscher 2015-09-21 15:38:43 +02:00
commit cc69de5c5f
22 changed files with 90 additions and 50 deletions

View File

@@ -61,13 +61,7 @@ public class BootstrapForTesting {
        try {
            JarHell.checkJarHell();
        } catch (Exception e) {
-           if (Boolean.parseBoolean(System.getProperty("tests.maven"))) {
                throw new RuntimeException("found jar hell in test classpath", e);
-           } else {
-               Loggers.getLogger(BootstrapForTesting.class)
-                   .warn("Your ide or custom test runner has jar hell issues, " +
-                       "you might want to look into that", e);
-           }
        }
        // make sure java.io.tmpdir exists always (in case code uses it in a static initializer)

View File

@@ -60,17 +60,13 @@ SearchResponse scrollResp = client.prepareSearch(test)
        .setQuery(qb)
        .setSize(100).execute().actionGet(); //100 hits per shard will be returned for each scroll
//Scroll until no hits are returned
-while (true) {
+do {
    for (SearchHit hit : scrollResp.getHits().getHits()) {
        //Handle the hit...
    }
    scrollResp = client.prepareSearchScroll(scrollResp.getScrollId()).setScroll(new TimeValue(600000)).execute().actionGet();
-   //Break condition: No hits are returned
-   if (scrollResp.getHits().getHits().length == 0) {
-       break;
-   }
-}
+} while(scrollResp.getHits().getHits().length != 0); // Zero hits mark the end of the scroll and the while loop.
--------------------------------------------------
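For context beyond the diff, here is a minimal sketch of the revised do/while scroll loop as a complete method, including clearing the scroll once it is exhausted. The index name, query, and timeout are placeholders, and the calls assume the transport-client API of this era:

[source,java]
--------------------------------------------------
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;

// Hypothetical helper illustrating the do/while scroll pattern shown in the diff above.
public final class ScrollAllHits {
    public static void scrollAll(Client client) {
        SearchResponse scrollResp = client.prepareSearch("test")        // placeholder index
                .setScroll(new TimeValue(600000))
                .setQuery(QueryBuilders.matchAllQuery())                // placeholder query
                .setSize(100)                                           // 100 hits per shard per scroll page
                .execute().actionGet();
        do {
            for (SearchHit hit : scrollResp.getHits().getHits()) {
                // Handle the hit...
            }
            scrollResp = client.prepareSearchScroll(scrollResp.getScrollId())
                    .setScroll(new TimeValue(600000))
                    .execute().actionGet();
        } while (scrollResp.getHits().getHits().length != 0); // zero hits ends the scroll
        // Release scroll resources on the cluster once done (not shown in the docs snippet).
        client.prepareClearScroll().addScrollId(scrollResp.getScrollId()).execute().actionGet();
    }
}
--------------------------------------------------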
[[java-search-msearch]]

View File

@@ -1,34 +1,83 @@
[[analysis-compound-word-tokenfilter]]
=== Compound Word Token Filter
-Token filters that allow to decompose compound words. There are two
-types available: `dictionary_decompounder` and
-`hyphenation_decompounder`.
+The `hyphenation_decompounder` and `dictionary_decompounder` token filters can
+decompose compound words found in many Germanic languages into word parts.
-The following are settings that can be set for a compound word token
-filter type:
+Both token filters require a dictionary of word parts, which can be provided
+as:
-[cols="<,<",options="header",]
-|=======================================================================
-|Setting |Description
-|`word_list` |A list of words to use.
+[horizontal]
+`word_list`::
-|`word_list_path` |A path (either relative to `config` location, or
-absolute) to a list of words.
+An array of words, specified inline in the token filter configuration, or
-|`hyphenation_patterns_path` |A path (either relative to `config` location, or
-absolute) to a FOP XML hyphenation pattern file. (See http://offo.sourceforge.net/hyphenation/)
-Required for `hyphenation_decompounder`.
+`word_list_path`::
-|`min_word_size` |Minimum word size(Integer). Defaults to 5.
+The path (either absolute or relative to the `config` directory) to a UTF-8
+encoded file containing one word per line.
-|`min_subword_size` |Minimum subword size(Integer). Defaults to 2.
+[float]
+=== Hyphenation decompounder
-|`max_subword_size` |Maximum subword size(Integer). Defaults to 15.
+The `hyphenation_decompounder` uses hyphenation grammars to find potential
+subwords that are then checked against the word dictionary. The quality of the
+output tokens is directly connected to the quality of the grammar file you
+use. For languages like German they are quite good.
+XML based hyphenation grammar files can be found in the
+http://offo.sourceforge.net/hyphenation/#FOP+XML+Hyphenation+Patterns[Objects For Formatting Objects]
+(OFFO) Sourceforge project. You can download http://downloads.sourceforge.net/offo/offo-hyphenation.zip[offo-hyphenation.zip]
+directly and look in the `offo-hyphenation/hyph/` directory.
+Credits for the hyphenation code go to the Apache FOP project.
+[float]
+=== Dictionary decompounder
+The `dictionary_decompounder` uses a brute force approach in conjunction with
+only the word dictionary to find subwords in a compound word. It is much
+slower than the hyphenation decompounder but can be used as a first start to
+check the quality of your dictionary.
+[float]
+=== Compound token filter parameters
+The following parameters can be used to configure a compound word token
+filter:
+[horizontal]
+`type`::
+Either `dictionary_decompounder` or `hyphenation_decompounder`.
+`word_list`::
+An array containing a list of words to use for the word dictionary.
+`word_list_path`::
+The path (either absolute or relative to the `config` directory) to the word dictionary.
+`hyphenation_patterns_path`::
+The path (either absolute or relative to the `config` directory) to a FOP XML hyphenation pattern file. (required for the `hyphenation_decompounder`)
+`min_word_size`::
+Minimum word size. Defaults to 5.
+`min_subword_size`::
+Minimum subword size. Defaults to 2.
+`max_subword_size`::
+Maximum subword size. Defaults to 15.
+`only_longest_match`::
+Whether to include only the longest matching subword or not. Defaults to `false`.
-|`only_longest_match` |Only matching the longest(Boolean). Defaults to
-`false`
-|=======================================================================
Here is an example:
@@ -48,5 +97,6 @@ index :
    myTokenFilter2 :
        type : hyphenation_decompounder
        word_list_path: path/to/words.txt
+       hyphenation_patterns_path: path/to/fop.xml
        max_subword_size : 22
--------------------------------------------------
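As a rough illustration of the parameters above, the following sketch registers a `dictionary_decompounder` at index-creation time through the Java admin client. The index name, filter/analyzer names, and the inline word list are placeholders, and `setSettings(String)` is assumed from the transport-client API of this era:

[source,java]
--------------------------------------------------
import org.elasticsearch.client.Client;

// Hypothetical example: a dictionary_decompounder wired into a custom analyzer.
public final class DecompounderSetup {
    public static void createIndex(Client client) {
        String settings = "{\n" +
                "  \"analysis\": {\n" +
                "    \"filter\": {\n" +
                "      \"german_decompounder\": {\n" +
                "        \"type\": \"dictionary_decompounder\",\n" +
                "        \"word_list\": [\"donau\", \"dampf\", \"schiff\", \"fahrt\"],\n" +
                "        \"min_subword_size\": 2,\n" +
                "        \"max_subword_size\": 15,\n" +
                "        \"only_longest_match\": false\n" +
                "      }\n" +
                "    },\n" +
                "    \"analyzer\": {\n" +
                "      \"german_compound\": {\n" +
                "        \"tokenizer\": \"standard\",\n" +
                "        \"filter\": [\"lowercase\", \"german_decompounder\"]\n" +
                "      }\n" +
                "    }\n" +
                "  }\n" +
                "}";
        client.admin().indices().prepareCreate("compound-test")
                .setSettings(settings)
                .execute().actionGet();
    }
}
--------------------------------------------------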

View File

@@ -56,7 +56,7 @@ newlines. Example:
$ cat requests
{ "index" : { "_index" : "test", "_type" : "type1", "_id" : "1" } }
{ "field1" : "value1" }
-$ curl -s -XPOST localhost:9200/_bulk --data-binary @requests; echo
+$ curl -s -XPOST localhost:9200/_bulk --data-binary "@requests"; echo
{"took":7,"items":[{"create":{"_index":"test","_type":"type1","_id":"1","_version":1}}]}
--------------------------------------------------
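A rough Java-client equivalent of the curl bulk request above, for readers following along in Java; the index, type, id, and field values mirror the request body shown, and the API calls are assumed from the transport client of this era:

[source,java]
--------------------------------------------------
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.client.Client;

// Sketch only: one indexing action submitted through the bulk API.
public final class BulkExample {
    public static void bulkIndex(Client client) {
        BulkResponse response = client.prepareBulk()
                .add(client.prepareIndex("test", "type1", "1")
                        .setSource("{\"field1\":\"value1\"}"))
                .execute().actionGet();
        if (response.hasFailures()) {
            // Individual actions can fail independently; inspect the failure message.
            System.err.println(response.buildFailureMessage());
        }
    }
}
--------------------------------------------------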

View File

@@ -544,7 +544,7 @@ You can download the sample dataset (accounts.json) from https://github.com/bly2
[source,sh]
--------------------------------------------------
-curl -XPOST 'localhost:9200/bank/account/_bulk?pretty' --data-binary @accounts.json
+curl -XPOST 'localhost:9200/bank/account/_bulk?pretty' --data-binary "@accounts.json"
curl 'localhost:9200/_cat/indices?v'
--------------------------------------------------
@@ -915,7 +915,7 @@ In SQL, the above aggregation is similar in concept to:
[source,sh]
--------------------------------------------------
-SELECT COUNT(*) from bank GROUP BY state ORDER BY COUNT(*) DESC
+SELECT state, COUNT(*) FROM bank GROUP BY state ORDER BY COUNT(*) DESC
--------------------------------------------------
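For comparison with the SQL above, a minimal sketch of how the same group-by-state aggregation might be issued from the Java client; the index and field come from the tutorial, the aggregation name is a placeholder, and terms buckets are ordered by document count descending by default, matching the `ORDER BY COUNT(*) DESC`:

[source,java]
--------------------------------------------------
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.search.aggregations.AggregationBuilders;
import org.elasticsearch.search.aggregations.bucket.terms.Terms;

// Sketch: "group by state" as a terms aggregation.
public final class GroupByState {
    public static void aggregate(Client client) {
        SearchResponse response = client.prepareSearch("bank")
                .setSize(0) // only the aggregation buckets are of interest, not the hits
                .addAggregation(AggregationBuilders.terms("group_by_state").field("state"))
                .execute().actionGet();
        Terms groupByState = response.getAggregations().get("group_by_state");
        for (Terms.Bucket bucket : groupByState.getBuckets()) {
            System.out.println(bucket.getKey() + " -> " + bucket.getDocCount());
        }
    }
}
--------------------------------------------------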
And the response (partially shown):

View File

@@ -70,8 +70,9 @@ Checking shards may take a lot of time on large indices.
[[index-codec]] `index.codec`::
-experimental[] The `default` value compresses stored data with LZ4
-compression, but this can be set to `best_compression` for a higher
+experimental[] The +default+ value compresses stored data with LZ4
+compression, but this can be set to +best_compression+
+which uses https://en.wikipedia.org/wiki/DEFLATE[DEFLATE] for a higher
compression ratio, at the expense of slower stored fields performance.
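Since `index.codec` is not a dynamic setting, a hedged sketch of opting in at index-creation time through the Java admin client; the index name is a placeholder and `setSettings(String)` is assumed from the transport client of this era:

[source,java]
--------------------------------------------------
import org.elasticsearch.client.Client;

// Illustrative only: create an index that stores fields with best_compression.
public final class BestCompressionIndex {
    public static void create(Client client) {
        client.admin().indices().prepareCreate("logs-archive")
                .setSettings("{\"index.codec\": \"best_compression\"}")
                .execute().actionGet();
    }
}
--------------------------------------------------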
[float]

View File

@@ -8,7 +8,7 @@ _filter context_:
Query context::
-A query used in query context will caculated relevance scores and will not be
+A query used in query context will calculate relevance scores and will not be
cacheable. Query context is used whenever filter context does not apply.
Filter context::
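To make the two contexts concrete, a small sketch of a `bool` query combining both through the Java API: the `must` clause runs in query context (scored), the `filter` clause in filter context (unscored, cacheable). Field names and values are placeholders:

[source,java]
--------------------------------------------------
import org.elasticsearch.index.query.BoolQueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;

// Sketch: query context vs. filter context inside one bool query.
public final class QueryVsFilterContext {
    public static BoolQueryBuilder build() {
        return QueryBuilders.boolQuery()
                .must(QueryBuilders.matchQuery("title", "search"))        // query context: contributes to _score
                .filter(QueryBuilders.termQuery("status", "published"));  // filter context: yes/no only
    }
}
--------------------------------------------------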

View File

@@ -35,7 +35,7 @@ $ cat requests
{"search_type" : "dfs_query_then_fetch"}
{"query" : {"match_all" : {}}}
-$ curl -XGET localhost:9200/_msearch --data-binary @requests; echo
+$ curl -XGET localhost:9200/_msearch --data-binary "@requests"; echo
--------------------------------------------------
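A hedged Java-client counterpart of the multi-search request above; the indices and queries are placeholders, and the calls are assumed from the transport client of this era:

[source,java]
--------------------------------------------------
import org.elasticsearch.action.search.MultiSearchResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.index.query.QueryBuilders;

// Sketch: several searches bundled into one round trip, each with its own response.
public final class MultiSearchExample {
    public static void msearch(Client client) {
        MultiSearchResponse response = client.prepareMultiSearch()
                .add(client.prepareSearch()                       // empty header: search all indices
                        .setQuery(QueryBuilders.matchAllQuery()))
                .add(client.prepareSearch("test")                 // header restricted to one index
                        .setQuery(QueryBuilders.matchAllQuery()))
                .execute().actionGet();
        for (MultiSearchResponse.Item item : response.getResponses()) {
            if (!item.isFailure()) {
                long hits = item.getResponse().getHits().getTotalHits();
                System.out.println("hits: " + hits);
            }
        }
    }
}
--------------------------------------------------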
Note, the above includes an example of an empty header (can also be just

View File

@@ -366,7 +366,7 @@ Request:
[source,js]
--------------------------------------------------
-curl -XGET 'localhost:9200/twitter/tweet/_mpercolate' --data-binary @requests.txt; echo
+curl -XGET 'localhost:9200/twitter/tweet/_mpercolate' --data-binary "@requests.txt"; echo
--------------------------------------------------
The index `twitter` is the default index, and the type `tweet` is the default type and will be used in the case a header

View File

@@ -1 +0,0 @@
-7ff51040bbcc9085dcb9a24a2c2a3cc7ac995988

View File

@@ -0,0 +1 @@
+b53f650323b7242dcced25b679f3e9aa4b494da5

View File

@@ -1 +0,0 @@
-b0712cc659e72b9da0f5b03872d2476ab4a695f7

View File

@@ -0,0 +1 @@
+50ba7eb31719be1260bdae51cf69340df2d91ec4

View File

@@ -16,7 +16,6 @@
<properties>
<elasticsearch.plugin.classname>org.elasticsearch.plugin.discovery.ec2.Ec2DiscoveryPlugin</elasticsearch.plugin.classname>
-<amazonaws.version>1.10.12</amazonaws.version>
<tests.jvms>1</tests.jvms>
<tests.rest.suite>discovery_ec2</tests.rest.suite>
<tests.rest.load_packaged>false</tests.rest.load_packaged>

View File

@@ -26,6 +26,7 @@
<elasticsearch.plugin.jvm>true</elasticsearch.plugin.jvm>
<elasticsearch.plugin.isolated>true</elasticsearch.plugin.isolated>
<elasticsearch.plugin.site>false</elasticsearch.plugin.site>
+<amazonaws.version>1.10.19</amazonaws.version>
</properties>
<dependencies>

View File

@@ -1 +0,0 @@
-7ff51040bbcc9085dcb9a24a2c2a3cc7ac995988

View File

@@ -0,0 +1 @@
+b53f650323b7242dcced25b679f3e9aa4b494da5

View File

@@ -1 +0,0 @@
-31afbe46b65e9933316c7e8dfb8b88dc4b37b6ba

View File

@@ -0,0 +1 @@
+c8764f3e61a3c420db429870ec22b31fe755d81d

View File

@@ -1 +0,0 @@
-c9e2593fdf398c5f8906a704db037d17b2de4b2a

View File

@@ -0,0 +1 @@
+a23dc60d56d54126250c23cab1d01328b1e83678

View File

@@ -16,7 +16,6 @@
<properties>
<elasticsearch.plugin.classname>org.elasticsearch.plugin.repository.s3.S3RepositoryPlugin</elasticsearch.plugin.classname>
-<amazonaws.version>1.10.12</amazonaws.version>
<tests.jvms>1</tests.jvms>
<tests.rest.suite>repository_s3</tests.rest.suite>
<tests.rest.load_packaged>false</tests.rest.load_packaged>