Added X versions of NGramTokenFilter and NGramTokenizer to Elasticsearch. These versions
no longer produce broken positions and prevent certain highlighter bugs that fail with StringIndexOutOfBoundsException, as in #2931. This commit breaks backwards compatibility for highlighting when NGramTokenFilter is used: the highlighter now highlights the entire term produced by the tokenizer instead of the individual sub-grams. To get sub-gram highlighting, use the ngram tokenizer instead. The old sub-gram highlighting relied on broken NGramTokenFilter behavior; the fix will ship in Lucene 4.4 but has been ported to Elasticsearch 0.90 in this commit. The broken behavior can still be used by specifying a version < LUCENE_42 in the token filter mapping. Closes #2931
commit bd7ff6946e
parent f09ad507a4
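For reference, a minimal sketch (not part of the commit) of index analysis settings that exercise both behaviors described above: an analyzer built on the ngram tokenizer, which keeps per-gram offsets and therefore highlights individual sub-grams, and one built on the ngram token filter, which now shares the offsets of the whole token and therefore highlights the entire term. The component names (my_ngramt, my_ngram, sub_gram_analyzer, whole_term_analyzer) are illustrative only; the setting keys mirror the ones used in the test added below.

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class NgramAnalysisSettingsSketch {
    public static Settings build() {
        return ImmutableSettings.settingsBuilder()
                // ngram *tokenizer*: grams keep their own offsets, so individual sub-grams are highlighted
                .put("analysis.tokenizer.my_ngramt.type", "ngram")
                .put("analysis.tokenizer.my_ngramt.min_gram", 1)
                .put("analysis.tokenizer.my_ngramt.max_gram", 20)
                .put("analysis.analyzer.sub_gram_analyzer.tokenizer", "my_ngramt")
                // ngram *token filter*: grams share the offsets of the original token, so the whole term is highlighted
                .put("analysis.filter.my_ngram.type", "ngram")
                .put("analysis.filter.my_ngram.min_gram", 1)
                .put("analysis.filter.my_ngram.max_gram", 20)
                .put("analysis.analyzer.whole_term_analyzer.tokenizer", "whitespace")
                .put("analysis.analyzer.whole_term_analyzer.filter", "my_ngram")
                .build();
    }
}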
@@ -0,0 +1,187 @@
package org.apache.lucene.analysis.ngram;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.LengthFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.Lucene;

/**
 * Tokenizes the input into n-grams of the given size(s).
 * <a name="version"/>
 * <p>You must specify the required {@link Version} compatibility when
 * creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
 * <li>emits all n-grams for the same token at the same position,</li>
 * <li>does not modify offsets,</li>
 * <li>sorts n-grams by their offset in the original token first, then
 * increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
 * "c").</li></ul>
 * <p>You can make this filter use the old behavior by providing a version <
 * {@link Version#LUCENE_44} in the constructor but this is not recommended as
 * it will lead to broken {@link TokenStream}s that will cause highlighting
 * bugs.
 */
public final class XNGramTokenFilter extends TokenFilter {

    static {
        // LUCENE MONITOR: this should be in Lucene 4.4, copied from Revision: 1476563
        assert Lucene.VERSION.ordinal() < Version.LUCENE_42.ordinal() + 2 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this should be removed";
    }

    public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
    public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

    private final int minGram, maxGram;

    private char[] curTermBuffer;
    private int curTermLength;
    private int curGramSize;
    private int curPos;
    private int curPosInc, curPosLen;
    private int tokStart;
    private int tokEnd;
    private boolean hasIllegalOffsets; // only if the length changed before this filter

    private final Version version;
    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt;
    private final PositionLengthAttribute posLenAtt;
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /**
     * Creates NGramTokenFilter with given min and max n-grams.
     * @param version Lucene version to enable correct position increments.
     *                See <a href="#version">above</a> for details.
     * @param input {@link TokenStream} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
        super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
        this.version = version;
        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram) {
            throw new IllegalArgumentException("minGram must not be greater than maxGram");
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
        if (version.onOrAfter(Version.LUCENE_42)) {
            posIncAtt = addAttribute(PositionIncrementAttribute.class);
            posLenAtt = addAttribute(PositionLengthAttribute.class);
        } else {
            posIncAtt = new PositionIncrementAttribute() {
                @Override
                public void setPositionIncrement(int positionIncrement) {}
                @Override
                public int getPositionIncrement() {
                    return 0;
                }
            };
            posLenAtt = new PositionLengthAttribute() {
                @Override
                public void setPositionLength(int positionLength) {}
                @Override
                public int getPositionLength() {
                    return 0;
                }
            };
        }
    }

    /**
     * Creates NGramTokenFilter with default min and max n-grams.
     * @param version Lucene version to enable correct position increments.
     *                See <a href="#version">above</a> for details.
     * @param input {@link TokenStream} holding the input to be tokenized
     */
    public XNGramTokenFilter(Version version, TokenStream input) {
        this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
    }

    /** Returns the next token in the stream, or null at EOS. */
    @Override
    public final boolean incrementToken() throws IOException {
        while (true) {
            if (curTermBuffer == null) {
                if (!input.incrementToken()) {
                    return false;
                } else {
                    curTermBuffer = termAtt.buffer().clone();
                    curTermLength = termAtt.length();
                    curGramSize = minGram;
                    curPos = 0;
                    curPosInc = posIncAtt.getPositionIncrement();
                    curPosLen = posLenAtt.getPositionLength();
                    tokStart = offsetAtt.startOffset();
                    tokEnd = offsetAtt.endOffset();
                    // if length by start + end offsets doesn't match the term text then assume
                    // this is a synonym and don't adjust the offsets.
                    hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
                }
            }
            if (version.onOrAfter(Version.LUCENE_42)) {
                if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
                    ++curPos;
                    curGramSize = minGram;
                }
                if (curPos + curGramSize <= curTermLength) {
                    clearAttributes();
                    termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
                    posIncAtt.setPositionIncrement(curPosInc);
                    curPosInc = 0;
                    posLenAtt.setPositionLength(curPosLen);
                    offsetAtt.setOffset(tokStart, tokEnd);
                    curGramSize++;
                    return true;
                }
            } else {
                while (curGramSize <= maxGram) {
                    while (curPos + curGramSize <= curTermLength) { // while there is input
                        clearAttributes();
                        termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
                        if (hasIllegalOffsets) {
                            offsetAtt.setOffset(tokStart, tokEnd);
                        } else {
                            offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
                        }
                        curPos++;
                        return true;
                    }
                    curGramSize++; // increase n-gram size
                    curPos = 0;
                }
            }
            curTermBuffer = null;
        }
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        curTermBuffer = null;
    }
}
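To make the ordering and position behavior described in the Javadoc above concrete, here is a minimal standalone sketch (not part of the commit) that runs the backported filter over the single token "abc" with minGram=1 and maxGram=3. It assumes Lucene 4.2+ and the class above are on the classpath; the class and variable names in the demo are made up for illustration.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.KeywordTokenizer;
import org.apache.lucene.analysis.ngram.XNGramTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.Version;

public class XNGramTokenFilterDemo {
    public static void main(String[] args) throws Exception {
        // wrap a single-token stream ("abc") with the backported filter
        TokenStream ts = new XNGramTokenFilter(Version.LUCENE_42,
                new KeywordTokenizer(new StringReader("abc")), 1, 3);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Expected order: a, ab, abc, b, bc, c -- the first gram carries the original
            // position increment, all following grams are emitted at the same position (inc = 0)
            System.out.println(term.toString() + " posInc=" + posInc.getPositionIncrement());
        }
        ts.end();
        ts.close();
    }
}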
@@ -0,0 +1,180 @@
package org.apache.lucene.analysis.ngram;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.Reader;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.lucene.Lucene;

/**
 * Tokenizes the input into n-grams of the given size(s).
 * <p>On the contrary to {@link NGramTokenFilter}, this class sets offsets so
 * that characters between startOffset and endOffset in the original stream are
 * the same as the term chars.
 * <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
 * <table>
 * <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
 * <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
 * <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
 * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
 * </table>
 * <a name="version"/>
 * <p>Before Lucene 4.4, this class had a different behavior:<ul>
 * <li>It didn't support more than 1024 chars of input, the rest was trashed.</li>
 * <li>The last whitespaces of the 1024 chars block were trimmed.</li>
 * <li>Tokens were emitted in a different order (by increasing lengths).</li></ul>
 * <p>Although highly discouraged, it is still possible to use the old behavior
 * through {@link Lucene43NGramTokenizer}.
 */
public final class XNGramTokenizer extends Tokenizer {
    public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
    public static final int DEFAULT_MAX_NGRAM_SIZE = 2;

    static {
        // LUCENE MONITOR: this should be in Lucene 4.4, copied from Revision: 1476563
        assert Lucene.VERSION.ordinal() < Version.LUCENE_42.ordinal() + 2 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] this should be removed";
    }

    private char[] buffer;
    private int bufferStart, bufferEnd; // remaining slice of the buffer
    private int offset;
    private int gramSize;
    private int minGram, maxGram;
    private boolean exhausted;

    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
    private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
    private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

    /**
     * Creates NGramTokenizer with given min and max n-grams.
     * @param version the lucene compatibility <a href="#version">version</a>
     * @param input {@link Reader} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
        super(input);
        init(version, minGram, maxGram);
    }

    /**
     * Creates NGramTokenizer with given min and max n-grams.
     * @param version the lucene compatibility <a href="#version">version</a>
     * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
     * @param input {@link Reader} holding the input to be tokenized
     * @param minGram the smallest n-gram to generate
     * @param maxGram the largest n-gram to generate
     */
    public XNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
        super(factory, input);
        init(version, minGram, maxGram);
    }

    /**
     * Creates NGramTokenizer with default min and max n-grams.
     * @param version the lucene compatibility <a href="#version">version</a>
     * @param input {@link Reader} holding the input to be tokenized
     */
    public XNGramTokenizer(Version version, Reader input) {
        this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
    }

    private void init(Version version, int minGram, int maxGram) {
        if (!version.onOrAfter(Version.LUCENE_42)) {
            throw new IllegalArgumentException("This class only works with Lucene 4.4+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
        }
        if (minGram < 1) {
            throw new IllegalArgumentException("minGram must be greater than zero");
        }
        if (minGram > maxGram) {
            throw new IllegalArgumentException("minGram must not be greater than maxGram");
        }
        this.minGram = minGram;
        this.maxGram = maxGram;
        buffer = new char[maxGram + 1024];
    }

    /** Returns the next token in the stream, or null at EOS. */
    @Override
    public boolean incrementToken() throws IOException {
        clearAttributes();

        // compact
        if (bufferStart >= buffer.length - maxGram) {
            System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
            bufferEnd -= bufferStart;
            bufferStart = 0;

            // fill in remaining space
            if (!exhausted) {
                // TODO: refactor to a shared readFully
                while (bufferEnd < buffer.length) {
                    final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
                    if (read == -1) {
                        exhausted = true;
                        break;
                    }
                    bufferEnd += read;
                }
            }
        }

        // should we go to the next offset?
        if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
            bufferStart++;
            offset++;
            gramSize = minGram;
        }

        // are there enough chars remaining?
        if (bufferStart + gramSize > bufferEnd) {
            return false;
        }

        termAtt.copyBuffer(buffer, bufferStart, gramSize);
        posIncAtt.setPositionIncrement(1);
        posLenAtt.setPositionLength(1);
        offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
        ++gramSize;
        return true;
    }

    @Override
    public void end() {
        final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
        offsetAtt.setOffset(endOffset, endOffset);
    }

    @Override
    public void reset() throws IOException {
        super.reset();
        bufferStart = bufferEnd = buffer.length;
        offset = 0;
        gramSize = minGram;
        exhausted = false;
    }
}
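A companion sketch (again not part of the commit) that reproduces the offset table from the tokenizer's Javadoc for "abcde" with minGram=2 and maxGram=3; unlike the token filter, each gram here gets its own start/end offsets, which is what enables sub-gram highlighting. Class and variable names are illustrative.

import java.io.StringReader;
import org.apache.lucene.analysis.ngram.XNGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.util.Version;

public class XNGramTokenizerDemo {
    public static void main(String[] args) throws Exception {
        XNGramTokenizer ts = new XNGramTokenizer(Version.LUCENE_42, new StringReader("abcde"), 2, 3);
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsets = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            // Expected, matching the Javadoc table:
            // ab[0,2] abc[0,3] bc[1,3] bcd[1,4] cd[2,4] cde[2,5] de[3,5]
            System.out.println(term.toString() + " [" + offsets.startOffset() + "," + offsets.endOffset() + "]");
        }
        ts.end();
        ts.close();
    }
}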
@@ -21,6 +21,8 @@ package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
import org.apache.lucene.analysis.ngram.XNGramTokenFilter;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;

@@ -47,6 +49,10 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {

    @Override
    public TokenStream create(TokenStream tokenStream) {
        if (this.version.onOrAfter(Version.LUCENE_42)) {
            // LUCENE MONITOR: this token filter is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
            return new XNGramTokenFilter(version, tokenStream, minGram, maxGram);
        }
        return new NGramTokenFilter(tokenStream, minGram, maxGram);
    }
}
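The factory above only picks the backported filter when the resolved analysis version is Lucene 4.2 or later, which is the escape hatch mentioned in the commit message. A hedged sketch of settings that keep the old (broken) behavior by pinning an older version on the filter; the exact "version" setting key and value format are assumptions based on how analysis factories resolve versions, not something this diff confirms, and the filter name is made up.

import org.elasticsearch.common.settings.ImmutableSettings;
import org.elasticsearch.common.settings.Settings;

public class LegacyNgramFilterSettingsSketch {
    public static Settings build() {
        return ImmutableSettings.settingsBuilder()
                .put("analysis.filter.my_legacy_ngram.type", "ngram")
                .put("analysis.filter.my_legacy_ngram.min_gram", 1)
                .put("analysis.filter.my_legacy_ngram.max_gram", 20)
                // assumed: a per-filter "version" below 4.2 forces the old NGramTokenFilter code path
                .put("analysis.filter.my_legacy_ngram.version", "4.1")
                .build();
    }
}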
@@ -21,6 +21,8 @@ package org.elasticsearch.index.analysis;

import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.ngram.XNGramTokenizer;
import org.apache.lucene.util.Version;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;

@@ -47,6 +49,10 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {

    @Override
    public Tokenizer create(Reader reader) {
        if (this.version.onOrAfter(Version.LUCENE_42)) {
            // LUCENE MONITOR: this tokenizer is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
            return new XNGramTokenizer(version, reader, minGram, maxGram);
        }
        return new NGramTokenizer(reader, minGram, maxGram);
    }
}
@@ -45,6 +45,7 @@ import org.elasticsearch.search.SearchHit;

import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.highlight.HighlightBuilder;
import org.elasticsearch.test.integration.AbstractNodesTests;
import org.hamcrest.Matcher;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;

@@ -61,6 +62,10 @@ import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight;

import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.greaterThan;
import static org.hamcrest.Matchers.notNullValue;

import static org.hamcrest.Matchers.instanceOf;
import static org.testng.Assert.fail;

@@ -87,6 +92,89 @@ public class HighlighterSearchTests extends AbstractNodesTests {

    protected Client getClient() {
        return client("server1");
    }

    @Test
    public void testNgramHighlighting() throws ElasticSearchException, IOException {
        try {
            client.admin().indices().prepareDelete("test").execute().actionGet();
        } catch (Exception e) {
            // ignore
        }

        client.admin().indices().prepareCreate("test")
                .addMapping("test", jsonBuilder()
                        .startObject()
                            .startObject("test")
                                .startObject("properties")
                                    .startObject("name")
                                        .field("type", "string")
                                        .field("index_analyzer", "name_index_analyzer")
                                        .field("search_analyzer", "name_search_analyzer")
                                        .field("term_vector", "with_positions_offsets")
                                    .endObject()
                                    .startObject("name2")
                                        .field("type", "string")
                                        .field("index_analyzer", "name2_index_analyzer")
                                        .field("search_analyzer", "name_search_analyzer")
                                        .field("term_vector", "with_positions_offsets")
                                    .endObject()
                                .endObject()
                            .endObject()
                        .endObject())
                .setSettings(ImmutableSettings.settingsBuilder()
                        .put("index.number_of_shards", 2)
                        .put("analysis.filter.my_ngram.max_gram", 20)
                        .put("analysis.filter.my_ngram.min_gram", 1)
                        .put("analysis.filter.my_ngram.type", "ngram")
                        .put("analysis.tokenizer.my_ngramt.max_gram", 20)
                        .put("analysis.tokenizer.my_ngramt.min_gram", 1)
                        .put("analysis.tokenizer.my_ngramt.type", "ngram")
                        .put("analysis.analyzer.name_index_analyzer.tokenizer", "my_ngramt")
                        .put("analysis.analyzer.name2_index_analyzer.tokenizer", "whitespace")
                        .put("analysis.analyzer.name2_index_analyzer.filter", "my_ngram")
                        .put("analysis.analyzer.name_search_analyzer.tokenizer", "whitespace"))
                .execute().actionGet();
        client.prepareIndex("test", "test", "1")
                .setSource(XContentFactory.jsonBuilder()
                        .startObject()
                            .field("name", "logicacmg ehemals avinci - the know how company")
                            .field("name2", "logicacmg ehemals avinci - the know how company")
                        .endObject())
                .setRefresh(true).execute().actionGet();

        SearchResponse search = client.prepareSearch().setQuery(matchQuery("name", "logica m")).addHighlightedField("name").execute().actionGet();
        assertHighlight(search, 0, "name", 0, equalTo("<em>logica</em>c<em>m</em>g ehe<em>m</em>als avinci - the know how co<em>m</em>pany"));

        search = client.prepareSearch().setQuery(matchQuery("name", "logica ma")).addHighlightedField("name").execute()
                .actionGet();
        assertHighlight(search, 0, "name", 0, equalTo("<em>logica</em>cmg ehe<em>ma</em>ls avinci - the know how company"));

        search = client.prepareSearch().setQuery(matchQuery("name", "logica")).addHighlightedField("name").execute().actionGet();
        assertHighlight(search, 0, "name", 0, equalTo("<em>logica</em>cmg ehemals avinci - the know how company"));

        search = client.prepareSearch().setQuery(matchQuery("name2", "logica m")).addHighlightedField("name2").execute().actionGet();
        assertHighlight(search, 0, "name2", 0, equalTo("<em>logicacmg</em> <em>ehemals</em> avinci - the know how <em>company</em>"));

        search = client.prepareSearch().setQuery(matchQuery("name2", "logica ma")).addHighlightedField("name2").execute()
                .actionGet();
        assertHighlight(search, 0, "name2", 0, equalTo("<em>logicacmg</em> <em>ehemals</em> avinci - the know how company"));

        search = client.prepareSearch().setQuery(matchQuery("name2", "logica")).addHighlightedField("name2").execute().actionGet();
        assertHighlight(search, 0, "name2", 0, equalTo("<em>logicacmg</em> ehemals avinci - the know how company"));
    }

    public void assertHighlight(SearchResponse resp, int hit, String field, int fragment, Matcher<String> matcher) {
        assertThat(resp.getShardFailures().length, equalTo(0));
        assertThat(resp.getHits().hits().length, greaterThan(hit));
        assertThat(resp.getHits().hits()[hit].getHighlightFields().get(field), notNullValue());
        assertThat(resp.getHits().hits()[hit].getHighlightFields().get(field).fragments().length, greaterThan(fragment));
        assertThat(resp.getHits().hits()[hit].highlightFields().get(field).fragments()[fragment].string(),
                matcher);
    }

    @Test
@@ -158,6 +246,8 @@ public class HighlighterSearchTests extends AbstractNodesTests {

        } catch (Exception e) {
            // ignore
        }

        client.admin().indices().prepareCreate("test").setSettings(ImmutableSettings.settingsBuilder().put("index.number_of_shards", 2))
                .addMapping("type1", jsonBuilder().startObject().startObject("type1").startObject("properties")