From 77239a76f81879742d21004454691d2dfc6499b4 Mon Sep 17 00:00:00 2001 From: Simon Willnauer Date: Fri, 8 Nov 2013 16:09:18 +0100 Subject: [PATCH] Lazily fill CharTermAttribute if needed in CompletionTokenStream This adds a delegate to CharTermAttributeImpl to be compatible with the Percolator that needs a CharTermAttribute. Yet compared to CharTermAttributImpl we only fill the BytesRef with UTF-8 since we already have it and only if we need to convert to UTF-16 we do it. Closes #4028 --- .../completion/CompletionTokenStream.java | 52 ++++++++++++------- .../suggest/CompletionSuggestSearchTests.java | 34 ++++++++++++ .../suggest/CompletionTokenStreamTest.java | 40 ++++++++++---- 3 files changed, 98 insertions(+), 28 deletions(-) diff --git a/src/main/java/org/elasticsearch/search/suggest/completion/CompletionTokenStream.java b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionTokenStream.java index 260bd57246d..5bf98145850 100644 --- a/src/main/java/org/elasticsearch/search/suggest/completion/CompletionTokenStream.java +++ b/src/main/java/org/elasticsearch/search/suggest/completion/CompletionTokenStream.java @@ -19,12 +19,8 @@ package org.elasticsearch.search.suggest.completion; import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute; -import org.apache.lucene.util.AttributeImpl; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IntsRef; +import org.apache.lucene.analysis.tokenattributes.*; +import org.apache.lucene.util.*; import org.apache.lucene.util.fst.Util; import java.io.IOException; @@ -36,9 +32,10 @@ import java.util.Set; */ public final class CompletionTokenStream extends TokenStream { - private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);; + private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class); private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class); - private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class); + private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);; + private final TokenStream input; private BytesRef payload; @@ -46,9 +43,11 @@ public final class CompletionTokenStream extends TokenStream { private ToFiniteStrings toFiniteStrings; private int posInc = -1; private static final int MAX_PATHS = 256; - private final BytesRef scratch = new BytesRef(); + private CharTermAttribute charTermAttribute; public CompletionTokenStream(TokenStream input, BytesRef payload, ToFiniteStrings toFiniteStrings) throws IOException { + // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume + // the input stream entirely in toFiniteStrings(input) this.input = input; this.payload = payload; this.toFiniteStrings = toFiniteStrings; @@ -74,8 +73,11 @@ public final class CompletionTokenStream extends TokenStream { * produced. Multi Fields have the same surface form and therefore sum up */ posInc = 0; - Util.toBytesRef(finiteStrings.next(), scratch); // now we have UTF-8 - bytesAtt.setBytesRef(scratch); + Util.toBytesRef(finiteStrings.next(), bytesAtt.getBytesRef()); // now we have UTF-8 + if (charTermAttribute != null) { + charTermAttribute.setLength(0); + charTermAttribute.append(bytesAtt.toUTF16()); + } if (payload != null) { payloadAttr.setPayload(this.payload); } @@ -107,16 +109,23 @@ public final class CompletionTokenStream extends TokenStream { @Override public void reset() throws IOException { super.reset(); + if (hasAttribute(CharTermAttribute.class)) { + // we only create this if we really need it to safe the UTF-8 to UTF-16 conversion + charTermAttribute = getAttribute(CharTermAttribute.class); + } finiteStrings = null; posInc = -1; } public interface ByteTermAttribute extends TermToBytesRefAttribute { - public void setBytesRef(BytesRef bytes); + // marker interface + + public CharSequence toUTF16(); } public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute { - private BytesRef bytes; + private final BytesRef bytes = new BytesRef(); + private CharsRef charsRef; @Override public int fillBytesRef() { @@ -128,19 +137,24 @@ public final class CompletionTokenStream extends TokenStream { return bytes; } - @Override - public void setBytesRef(BytesRef bytes) { - this.bytes = bytes; - } - @Override public void clear() { + bytes.length = 0; } @Override public void copyTo(AttributeImpl target) { ByteTermAttributeImpl other = (ByteTermAttributeImpl) target; - other.bytes = bytes; + other.bytes.copyBytes(bytes); + } + + @Override + public CharSequence toUTF16() { + if (charsRef == null) { + charsRef = new CharsRef(); + } + UnicodeUtil.UTF8toUTF16(bytes, charsRef); + return charsRef; } } } \ No newline at end of file diff --git a/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchTests.java b/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchTests.java index 1d518ce4e23..c49695e472e 100644 --- a/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchTests.java +++ b/src/test/java/org/elasticsearch/search/suggest/CompletionSuggestSearchTests.java @@ -28,14 +28,17 @@ import org.elasticsearch.action.admin.indices.segments.IndexShardSegments; import org.elasticsearch.action.admin.indices.segments.ShardSegments; import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse; import org.elasticsearch.action.index.IndexRequestBuilder; +import org.elasticsearch.action.percolate.PercolateResponse; import org.elasticsearch.action.search.SearchPhaseExecutionException; import org.elasticsearch.action.suggest.SuggestResponse; +import org.elasticsearch.client.Requests; import org.elasticsearch.common.settings.ImmutableSettings; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.xcontent.XContentBuilder; import org.elasticsearch.index.mapper.MapperException; import org.elasticsearch.index.mapper.MapperParsingException; import org.elasticsearch.index.mapper.core.CompletionFieldMapper; +import org.elasticsearch.percolator.PercolatorService; import org.elasticsearch.search.sort.FieldSortBuilder; import org.elasticsearch.search.suggest.completion.CompletionStats; import org.elasticsearch.search.suggest.completion.CompletionSuggestion; @@ -56,6 +59,7 @@ import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS; import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder; import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder; +import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery; import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures; import static org.hamcrest.Matchers.*; @@ -89,6 +93,36 @@ public class CompletionSuggestSearchTests extends ElasticsearchIntegrationTest { assertSuggestionsNotInOrder("t", "The Prodigy", "Turbonegro", "Turbonegro Get it on", "The Prodigy Firestarter"); } + @Test + public void testSuggestFieldWithPercolateApi() throws Exception { + createIndexAndMapping(); + String[][] input = {{"Foo Fighters"}, {"Foo Fighters"}, {"Foo Fighters"}, {"Foo Fighters"}, + {"Generator", "Foo Fighters Generator"}, {"Learn to Fly", "Foo Fighters Learn to Fly"}, + {"The Prodigy"}, {"The Prodigy"}, {"The Prodigy"}, {"Firestarter", "The Prodigy Firestarter"}, + {"Turbonegro"}, {"Turbonegro"}, {"Get it on", "Turbonegro Get it on"}}; // work with frequencies + for (int i = 0; i < input.length; i++) { + client().prepareIndex(INDEX, TYPE, "" + i) + .setSource(jsonBuilder() + .startObject().startObject(FIELD) + .startArray("input").value(input[i]).endArray() + .endObject() + .endObject() + ) + .execute().actionGet(); + } + + client().prepareIndex(INDEX, PercolatorService.TYPE_NAME, "4") + .setSource(jsonBuilder().startObject().field("query", matchAllQuery()).endObject()) + .execute().actionGet(); + + refresh(); + + PercolateResponse response = client().preparePercolate().setIndices(INDEX).setDocumentType(TYPE) + .setGetRequest(Requests.getRequest(INDEX).type(TYPE).id("1")) + .execute().actionGet(); + assertThat(response.getCount(), equalTo(1l)); + } + @Test public void testBasicPrefixSuggestion() throws Exception { createIndexAndMapping(); diff --git a/src/test/java/org/elasticsearch/search/suggest/CompletionTokenStreamTest.java b/src/test/java/org/elasticsearch/search/suggest/CompletionTokenStreamTest.java index d5ebfa5e19b..974652edc86 100644 --- a/src/test/java/org/elasticsearch/search/suggest/CompletionTokenStreamTest.java +++ b/src/test/java/org/elasticsearch/search/suggest/CompletionTokenStreamTest.java @@ -25,10 +25,7 @@ import org.apache.lucene.analysis.core.SimpleAnalyzer; import org.apache.lucene.analysis.synonym.SynonymFilter; import org.apache.lucene.analysis.synonym.SynonymMap; import org.apache.lucene.analysis.synonym.SynonymMap.Builder; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PayloadAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; +import org.apache.lucene.analysis.tokenattributes.*; import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.CharsRef; @@ -42,6 +39,8 @@ import java.io.IOException; import java.io.StringReader; import java.util.Set; +import static org.hamcrest.Matchers.equalTo; + public class CompletionTokenStreamTest extends ElasticsearchTokenStreamTestCase { final XAnalyzingSuggester suggester = new XAnalyzingSuggester(new SimpleAnalyzer(TEST_VERSION_CURRENT)); @@ -143,12 +142,36 @@ public class CompletionTokenStreamTest extends ElasticsearchTokenStreamTestCase suggestTokenStream.close(); } - + + @Test + public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception { + TokenStream tokenStream = new MockTokenizer(new StringReader("mykeyword"), MockTokenizer.WHITESPACE, true); + BytesRef payload = new BytesRef("Surface keyword|friggin payload|10"); + TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(new CompletionTokenStream(tokenStream, payload, new CompletionTokenStream.ToFiniteStrings() { + @Override + public Set toFiniteStrings(TokenStream stream) throws IOException { + return suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream); + } + })); + TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class); + BytesRef ref = termAtt.getBytesRef(); + assertNotNull(ref); + suggestTokenStream.reset(); + + while (suggestTokenStream.incrementToken()) { + termAtt.fillBytesRef(); + assertThat(ref.utf8ToString(), equalTo("mykeyword")); + } + suggestTokenStream.end(); + suggestTokenStream.close(); + } + + public final static class ByteTermAttrToCharTermAttrFilter extends TokenFilter { - private CharTermAttribute attr = addAttribute(CharTermAttribute.class); private ByteTermAttribute byteAttr = addAttribute(ByteTermAttribute.class); private PayloadAttribute payload = addAttribute(PayloadAttribute.class); private TypeAttribute type = addAttribute(TypeAttribute.class); + private CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class); protected ByteTermAttrToCharTermAttrFilter(TokenStream input) { super(input); } @@ -157,13 +180,12 @@ public class CompletionTokenStreamTest extends ElasticsearchTokenStreamTestCase public boolean incrementToken() throws IOException { if (input.incrementToken()) { BytesRef bytesRef = byteAttr.getBytesRef(); - attr.append(bytesRef.utf8ToString()); // we move them over so we can assert them more easily in the tests - type.setType(payload.getPayload().utf8ToString()); + type.setType(payload.getPayload().utf8ToString()); return true; } return false; } - + } }