diff --git a/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenFilter.java b/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenFilter.java
new file mode 100644
index 00000000000..81888c96ba1
--- /dev/null
+++ b/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenFilter.java
@@ -0,0 +1,187 @@
+package org.apache.lucene.analysis.ngram;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.LengthFilter;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.common.lucene.Lucene;
+
+/**
+ * Tokenizes the input into n-grams of the given size(s).
+ *
+ * <p>You must specify the required {@link Version} compatibility when
+ * creating a {@link NGramTokenFilter}. As of Lucene 4.4, this token filter:<ul>
+ * <li>emits all n-grams for the same token at the same position,</li>
+ * <li>keeps the original token's offsets for every gram,</li>
+ * <li>sorts n-grams by their offset in the original token first, then
+ * increasing length (meaning that "abc" will give "a", "ab", "abc", "b", "bc",
+ * "c").</li></ul>
+ * <p>You can make this filter use the old behavior by providing a version &lt;
+ * {@link Version#LUCENE_44} in the constructor, but this is not recommended as
+ * it will lead to broken {@link TokenStream}s that will cause highlighting
+ * bugs.
+ */
+public final class XNGramTokenFilter extends TokenFilter {
+
+    static {
+        // LUCENE MONITOR: this should be in Lucene 4.4; copied from Revision: 1476563
+        assert Lucene.VERSION.ordinal() < Version.LUCENE_42.ordinal() + 2 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] - this can be removed";
+    }
+
+    public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
+    public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+    private final int minGram, maxGram;
+
+    private char[] curTermBuffer;
+    private int curTermLength;
+    private int curGramSize;
+    private int curPos;
+    private int curPosInc, curPosLen;
+    private int tokStart;
+    private int tokEnd;
+    private boolean hasIllegalOffsets; // only if the length changed before this filter
+
+    private final Version version;
+    private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+    private final PositionIncrementAttribute posIncAtt;
+    private final PositionLengthAttribute posLenAtt;
+    private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+    /**
+     * Creates NGramTokenFilter with given min and max n-grams.
+     * @param version Lucene version to enable correct position increments.
+     *                See above for details.
+     * @param input {@link TokenStream} holding the input to be tokenized
+     * @param minGram the smallest n-gram to generate
+     * @param maxGram the largest n-gram to generate
+     */
+    public XNGramTokenFilter(Version version, TokenStream input, int minGram, int maxGram) {
+        super(new LengthFilter(true, input, minGram, Integer.MAX_VALUE));
+        this.version = version;
+        if (minGram < 1) {
+            throw new IllegalArgumentException("minGram must be greater than zero");
+        }
+        if (minGram > maxGram) {
+            throw new IllegalArgumentException("minGram must not be greater than maxGram");
+        }
+        this.minGram = minGram;
+        this.maxGram = maxGram;
+        if (version.onOrAfter(Version.LUCENE_42)) {
+            posIncAtt = addAttribute(PositionIncrementAttribute.class);
+            posLenAtt = addAttribute(PositionLengthAttribute.class);
+        } else {
+            // emulate the old behavior: dummy attributes that are not registered
+            // with the stream and ignore all updates
+            posIncAtt = new PositionIncrementAttribute() {
+                @Override
+                public void setPositionIncrement(int positionIncrement) {}
+                @Override
+                public int getPositionIncrement() {
+                    return 0;
+                }
+            };
+            posLenAtt = new PositionLengthAttribute() {
+                @Override
+                public void setPositionLength(int positionLength) {}
+                @Override
+                public int getPositionLength() {
+                    return 0;
+                }
+            };
+        }
+    }
+
+    /**
+     * Creates NGramTokenFilter with default min and max n-grams.
+     * @param version Lucene version to enable correct position increments.
+     *                See above for details.
+     * @param input {@link TokenStream} holding the input to be tokenized
+     */
+    public XNGramTokenFilter(Version version, TokenStream input) {
+        this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+    }
+
+    /** Returns true and sets the attributes of the next n-gram token, or false at end of stream. */
+    @Override
+    public final boolean incrementToken() throws IOException {
+        while (true) {
+            if (curTermBuffer == null) {
+                if (!input.incrementToken()) {
+                    return false;
+                } else {
+                    curTermBuffer = termAtt.buffer().clone();
+                    curTermLength = termAtt.length();
+                    curGramSize = minGram;
+                    curPos = 0;
+                    curPosInc = posIncAtt.getPositionIncrement();
+                    curPosLen = posLenAtt.getPositionLength();
+                    tokStart = offsetAtt.startOffset();
+                    tokEnd = offsetAtt.endOffset();
+                    // if length by start + end offsets doesn't match the term text then assume
+                    // this is a synonym and don't adjust the offsets.
+                    hasIllegalOffsets = (tokStart + curTermLength) != tokEnd;
+                }
+            }
+            if (version.onOrAfter(Version.LUCENE_42)) {
+                if (curGramSize > maxGram || curPos + curGramSize > curTermLength) {
+                    ++curPos;
+                    curGramSize = minGram;
+                }
+                if (curPos + curGramSize <= curTermLength) {
+                    clearAttributes();
+                    termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
+                    // all grams of the same token share its position and offsets
+                    posIncAtt.setPositionIncrement(curPosInc);
+                    curPosInc = 0;
+                    posLenAtt.setPositionLength(curPosLen);
+                    offsetAtt.setOffset(tokStart, tokEnd);
+                    curGramSize++;
+                    return true;
+                }
+            } else {
+                while (curGramSize <= maxGram) {
+                    while (curPos + curGramSize <= curTermLength) { // while there is input
+                        clearAttributes();
+                        termAtt.copyBuffer(curTermBuffer, curPos, curGramSize);
+                        if (hasIllegalOffsets) {
+                            offsetAtt.setOffset(tokStart, tokEnd);
+                        } else {
+                            offsetAtt.setOffset(tokStart + curPos, tokStart + curPos + curGramSize);
+                        }
+                        curPos++;
+                        return true;
+                    }
+                    curGramSize++; // increase n-gram size
+                    curPos = 0;
+                }
+            }
+            curTermBuffer = null;
+        }
+    }
+
+    @Override
+    public void reset() throws IOException {
+        super.reset();
+        curTermBuffer = null;
+    }
+}
diff --git a/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenizer.java b/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenizer.java
new file mode 100644
index 00000000000..1b174120905
--- /dev/null
+++ b/src/main/java/org/apache/lucene/analysis/ngram/XNGramTokenizer.java
@@ -0,0 +1,180 @@
+package org.apache.lucene.analysis.ngram;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionLengthAttribute;
+import org.apache.lucene.util.AttributeSource.AttributeFactory;
+import org.apache.lucene.util.Version;
+import org.elasticsearch.common.lucene.Lucene;
+
+/**
+ * Tokenizes the input into n-grams of the given size(s).
+ * <p>Unlike {@link NGramTokenFilter}, this class sets offsets so that
+ * characters between startOffset and endOffset in the original stream are
+ * the same as the term chars.
+ * <p>For example, "abcde" would be tokenized as (minGram=2, maxGram=3):
+ * <table>
+ * <tr><th>Term</th><td>ab</td><td>abc</td><td>bc</td><td>bcd</td><td>cd</td><td>cde</td><td>de</td></tr>
+ * <tr><th>Position increment</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+ * <tr><th>Position length</th><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td><td>1</td></tr>
+ * <tr><th>Offsets</th><td>[0,2[</td><td>[0,3[</td><td>[1,3[</td><td>[1,4[</td><td>[2,4[</td><td>[2,5[</td><td>[3,5[</td></tr>
+ * </table>
+ * <p>Before Lucene 4.4, this class had a different behavior:<ul>
+ * <li>it didn't support more than 1024 chars of input,</li>
+ * <li>it trimmed trailing whitespaces,</li>
+ * <li>it emitted tokens by increasing n-gram size first instead of by
+ * increasing start offset.</li></ul>
+ * <p>Although highly discouraged, it is still possible to use the old behavior
+ * through {@link Lucene43NGramTokenizer}.
+ */
+public final class XNGramTokenizer extends Tokenizer {
+ public static final int DEFAULT_MIN_NGRAM_SIZE = 1;
+ public static final int DEFAULT_MAX_NGRAM_SIZE = 2;
+
+ static {
+ // LUCENE MONITOR: this should be in Lucene 4.4; copied from Revision: 1476563
+ assert Lucene.VERSION.ordinal() < Version.LUCENE_42.ordinal() + 2 : "Elasticsearch has upgraded to Lucene Version: [" + Lucene.VERSION + "] - this can be removed";
+ }
+
+ private char[] buffer;
+ private int bufferStart, bufferEnd; // remaining slice of the buffer
+ private int offset;
+ private int gramSize;
+ private int minGram, maxGram;
+ private boolean exhausted;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ private final PositionLengthAttribute posLenAtt = addAttribute(PositionLengthAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param version the lucene compatibility version
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public XNGramTokenizer(Version version, Reader input, int minGram, int maxGram) {
+ super(input);
+ init(version, minGram, maxGram);
+ }
+
+ /**
+ * Creates NGramTokenizer with given min and max n-grams.
+ * @param version the lucene compatibility version
+ * @param factory {@link org.apache.lucene.util.AttributeSource.AttributeFactory} to use
+ * @param input {@link Reader} holding the input to be tokenized
+ * @param minGram the smallest n-gram to generate
+ * @param maxGram the largest n-gram to generate
+ */
+ public XNGramTokenizer(Version version, AttributeFactory factory, Reader input, int minGram, int maxGram) {
+ super(factory, input);
+ init(version, minGram, maxGram);
+ }
+
+ /**
+ * Creates NGramTokenizer with default min and max n-grams.
+ * @param version the lucene compatibility version
+ * @param input {@link Reader} holding the input to be tokenized
+ */
+ public XNGramTokenizer(Version version, Reader input) {
+ this(version, input, DEFAULT_MIN_NGRAM_SIZE, DEFAULT_MAX_NGRAM_SIZE);
+ }
+
+ private void init(Version version, int minGram, int maxGram) {
+ if (!version.onOrAfter(Version.LUCENE_42)) {
+ throw new IllegalArgumentException("This class only works with Lucene 4.2+. To emulate the old (broken) behavior of NGramTokenizer, use Lucene43NGramTokenizer");
+ }
+ if (minGram < 1) {
+ throw new IllegalArgumentException("minGram must be greater than zero");
+ }
+ if (minGram > maxGram) {
+ throw new IllegalArgumentException("minGram must not be greater than maxGram");
+ }
+ this.minGram = minGram;
+ this.maxGram = maxGram;
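+ // size the buffer so that, after compaction, there is room to read at least 1024 fresh chars past the largest gram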
+ buffer = new char[maxGram + 1024];
+ }
+
+ /** Returns true and sets the attributes of the next n-gram token, or false at end of stream. */
+ @Override
+ public boolean incrementToken() throws IOException {
+ clearAttributes();
+
+ // compact
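+ // shift the unread slice [bufferStart, bufferEnd) to the front of the buffer before refilling,
+ // so a gram of up to maxGram chars never runs past the end of the buffer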
+ if (bufferStart >= buffer.length - maxGram) {
+ System.arraycopy(buffer, bufferStart, buffer, 0, bufferEnd - bufferStart);
+ bufferEnd -= bufferStart;
+ bufferStart = 0;
+
+ // fill in remaining space
+ if (!exhausted) {
+ // TODO: refactor to a shared readFully
+ while (bufferEnd < buffer.length) {
+ final int read = input.read(buffer, bufferEnd, buffer.length - bufferEnd);
+ if (read == -1) {
+ exhausted = true;
+ break;
+ }
+ bufferEnd += read;
+ }
+ }
+ }
+
+ // should we go to the next offset?
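+ // (either all gram sizes for this start position were emitted, or too few chars remain for the current size)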
+ if (gramSize > maxGram || bufferStart + gramSize > bufferEnd) {
+ bufferStart++;
+ offset++;
+ gramSize = minGram;
+ }
+
+ // are there enough chars remaining?
+ if (bufferStart + gramSize > bufferEnd) {
+ return false;
+ }
+
+ termAtt.copyBuffer(buffer, bufferStart, gramSize);
+ posIncAtt.setPositionIncrement(1);
+ posLenAtt.setPositionLength(1);
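+ // offsets are exact: they cover precisely the gram's own chars in the original input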
+ offsetAtt.setOffset(correctOffset(offset), correctOffset(offset + gramSize));
+ ++gramSize;
+ return true;
+ }
+
+ @Override
+ public void end() {
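+ // set the final offset to the total number of chars consumed from the input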
+ final int endOffset = correctOffset(offset + bufferEnd - bufferStart);
+ offsetAtt.setOffset(endOffset, endOffset);
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
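+ // pretend the buffer is exhausted so the first incrementToken() call compacts and refills it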
+ bufferStart = bufferEnd = buffer.length;
+ offset = 0;
+ gramSize = minGram;
+ exhausted = false;
+ }
+}
diff --git a/src/main/java/org/elasticsearch/index/analysis/NGramTokenFilterFactory.java b/src/main/java/org/elasticsearch/index/analysis/NGramTokenFilterFactory.java
index 8b9964804cf..aebf05b9028 100644
--- a/src/main/java/org/elasticsearch/index/analysis/NGramTokenFilterFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/NGramTokenFilterFactory.java
@@ -21,6 +21,8 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.ngram.NGramTokenFilter;
+import org.apache.lucene.analysis.ngram.XNGramTokenFilter;
+import org.apache.lucene.util.Version;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
@@ -47,6 +49,10 @@ public class NGramTokenFilterFactory extends AbstractTokenFilterFactory {
@Override
public TokenStream create(TokenStream tokenStream) {
+ if (this.version.onOrAfter(Version.LUCENE_42)) {
+ // LUCENE MONITOR: this token filter is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
+ return new XNGramTokenFilter(version, tokenStream, minGram, maxGram);
+ }
return new NGramTokenFilter(tokenStream, minGram, maxGram);
}
}
\ No newline at end of file
diff --git a/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java b/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java
index f7084cf0bd2..f99af2ed444 100644
--- a/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java
+++ b/src/main/java/org/elasticsearch/index/analysis/NGramTokenizerFactory.java
@@ -21,6 +21,8 @@ package org.elasticsearch.index.analysis;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.ngram.NGramTokenizer;
+import org.apache.lucene.analysis.ngram.XNGramTokenizer;
+import org.apache.lucene.util.Version;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.assistedinject.Assisted;
import org.elasticsearch.common.settings.Settings;
@@ -47,6 +49,10 @@ public class NGramTokenizerFactory extends AbstractTokenizerFactory {
@Override
public Tokenizer create(Reader reader) {
+ if (this.version.onOrAfter(Version.LUCENE_42)) {
+ // LUCENE MONITOR: this tokenizer is a copy from lucene trunk and should go away once we upgrade to lucene 4.4
+ return new XNGramTokenizer(version, reader, minGram, maxGram);
+ }
return new NGramTokenizer(reader, minGram, maxGram);
}
}
\ No newline at end of file
diff --git a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java
index 65ffabec45a..4a9ec3cc93f 100644
--- a/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java
+++ b/src/test/java/org/elasticsearch/test/integration/search/highlight/HighlighterSearchTests.java
@@ -45,6 +45,7 @@ import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;
import org.elasticsearch.search.highlight.HighlightBuilder;
import org.elasticsearch.test.integration.AbstractNodesTests;
+import org.hamcrest.Matcher;
import org.testng.annotations.AfterClass;
import org.testng.annotations.BeforeClass;
import org.testng.annotations.Test;
@@ -61,6 +62,10 @@ import static org.elasticsearch.search.builder.SearchSourceBuilder.highlight;
import static org.elasticsearch.search.builder.SearchSourceBuilder.searchSource;
import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThan;
+import static org.hamcrest.Matchers.notNullValue;
import static org.hamcrest.Matchers.instanceOf;
import static org.testng.Assert.fail;
@@ -87,6 +92,89 @@ public class HighlighterSearchTests extends AbstractNodesTests {
protected Client getClient() {
return client("server1");
}
+
+ @Test
+ public void testNgramHighlighting() throws ElasticSearchException, IOException {
+ try {
+ client.admin().indices().prepareDelete("test").execute().actionGet();
+ } catch (Exception e) {
+ // ignore
+ }
+
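+ // "name" is indexed with an ngram tokenizer, "name2" with a whitespace tokenizer plus an ngram filter;
+ // both should highlight cleanly now that n-gram offsets are no longer broken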
+ client.admin().indices().prepareCreate("test")
+ .addMapping("test", jsonBuilder()
+ .startObject()
+ .startObject("test")
+ .startObject("properties")
+ .startObject("name")
+ .field("type", "string")
+ .field("index_analyzer", "name_index_analyzer")
+ .field("search_analyzer", "name_search_analyzer")
+ .field("term_vector", "with_positions_offsets")
+ .endObject()
+ .startObject("name2")
+ .field("type", "string")
+ .field("index_analyzer", "name2_index_analyzer")
+ .field("search_analyzer", "name_search_analyzer")
+ .field("term_vector", "with_positions_offsets")
+ .endObject()
+ .endObject()
+ .endObject()
+ .endObject())
+ .setSettings(ImmutableSettings.settingsBuilder()
+ .put("index.number_of_shards", 2)
+ .put("analysis.filter.my_ngram.max_gram", 20)
+ .put("analysis.filter.my_ngram.min_gram", 1)
+ .put("analysis.filter.my_ngram.type", "ngram")
+ .put("analysis.tokenizer.my_ngramt.max_gram", 20)
+ .put("analysis.tokenizer.my_ngramt.min_gram", 1)
+ .put("analysis.tokenizer.my_ngramt.type", "ngram")
+ .put("analysis.analyzer.name_index_analyzer.tokenizer", "my_ngramt")
+ .put("analysis.analyzer.name2_index_analyzer.tokenizer", "whitespace")
+ .put("analysis.analyzer.name2_index_analyzer.filter", "my_ngram")
+ .put("analysis.analyzer.name_search_analyzer.tokenizer", "whitespace"))
+ .execute().actionGet();
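+ // index a single document; setRefresh(true) makes it searchable immediately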
+ client.prepareIndex("test", "test", "1")
+ .setSource(XContentFactory.jsonBuilder()
+ .startObject()
+ .field("name", "logicacmg ehemals avinci - the know how company")
+ .field("name2", "logicacmg ehemals avinci - the know how company")
+ .endObject())
+ .setRefresh(true).execute().actionGet();
+ SearchResponse search = client.prepareSearch().setQuery(matchQuery("name", "logica m")).addHighlightedField("name").execute().actionGet();
+ assertHighlight(search, 0, "name", 0, equalTo("<em>logica</em>c<em>m</em>g ehe<em>m</em>als avinci - the know how co<em>m</em>pany"));
+
+ search = client.prepareSearch().setQuery(matchQuery("name", "logica ma")).addHighlightedField("name").execute()
+ .actionGet();
+ assertHighlight(search, 0, "name", 0, equalTo("<em>logica</em>cmg ehe<em>ma</em>ls avinci - the know how company"));
+
+ search = client.prepareSearch().setQuery(matchQuery("name", "logica")).addHighlightedField("name").execute().actionGet();
+ assertHighlight(search, 0, "name", 0, equalTo("<em>logica</em>cmg ehemals avinci - the know how company"));
+
+ search = client.prepareSearch().setQuery(matchQuery("name2", "logica m")).addHighlightedField("name2").execute().actionGet();
+ assertHighlight(search, 0, "name2", 0, equalTo("<em>logicacmg</em> <em>ehemals</em> avinci - the know how <em>company</em>"));
+
+ search = client.prepareSearch().setQuery(matchQuery("name2", "logica ma")).addHighlightedField("name2").execute()
+ .actionGet();
+ assertHighlight(search, 0, "name2", 0, equalTo("<em>logicacmg</em> <em>ehemals</em> avinci - the know how company"));
+
+ search = client.prepareSearch().setQuery(matchQuery("name2", "logica")).addHighlightedField("name2").execute().actionGet();
+ assertHighlight(search, 0, "name2", 0, equalTo("<em>logicacmg</em> ehemals avinci - the know how company"));
+
+ }
+
+ public void assertHighlight(SearchResponse resp, int hit, String field, int fragment, Matcher