Lazily fill CharTermAttribute if needed in CompletionTokenStream

This adds a delegate to CharTermAttributeImpl so that we are compatible
with the Percolator, which needs a CharTermAttribute. Yet, compared
to CharTermAttributeImpl, we only fill the BytesRef with UTF-8, since
we already have it, and convert to UTF-16 only when we actually need to.

Closes #4028
Simon Willnauer 2013-11-08 16:09:18 +01:00
parent f7d5d1e5c9
commit 77239a76f8
3 changed files with 98 additions and 28 deletions
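
The core idea, as a minimal sketch (illustrative only; it reuses the Lucene 4.x UnicodeUtil/CharsRef API that the diff below also uses): keep the term as the UTF-8 BytesRef the suggester already produced, and pay for the UTF-16 conversion only when a consumer such as the percolator actually asks for characters.

```java
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.UnicodeUtil;

// Illustrative sketch, not the committed code: defer the UTF-8 -> UTF-16
// conversion until characters are actually requested.
final class LazyUtf16Term {
    private final BytesRef bytes = new BytesRef(); // UTF-8 term, always filled
    private CharsRef charsRef;                     // allocated on first use only

    CharSequence toUTF16() {
        if (charsRef == null) {
            charsRef = new CharsRef(); // nobody asked before, so allocate lazily
        }
        UnicodeUtil.UTF8toUTF16(bytes, charsRef); // convert the bytes we already have
        return charsRef;
    }
}
```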

CompletionTokenStream.java

@@ -19,12 +19,8 @@
 package org.elasticsearch.search.suggest.completion;

 import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
-import org.apache.lucene.util.AttributeImpl;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.analysis.tokenattributes.*;
+import org.apache.lucene.util.*;
 import org.apache.lucene.util.fst.Util;

 import java.io.IOException;
@@ -36,9 +32,10 @@ import java.util.Set;
  */
 public final class CompletionTokenStream extends TokenStream {

-    private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);;
+    private final PayloadAttribute payloadAttr = addAttribute(PayloadAttribute.class);
     private final PositionIncrementAttribute posAttr = addAttribute(PositionIncrementAttribute.class);
-    private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);;
+    private final ByteTermAttribute bytesAtt = addAttribute(ByteTermAttribute.class);

     private final TokenStream input;
     private BytesRef payload;
@@ -46,9 +43,11 @@ public final class CompletionTokenStream extends TokenStream {
     private ToFiniteStrings toFiniteStrings;
     private int posInc = -1;
     private static final int MAX_PATHS = 256;
-    private final BytesRef scratch = new BytesRef();
+    private CharTermAttribute charTermAttribute;

     public CompletionTokenStream(TokenStream input, BytesRef payload, ToFiniteStrings toFiniteStrings) throws IOException {
+        // Don't call the super(input) ctor - this is a true delegate and has a new attribute source since we consume
+        // the input stream entirely in toFiniteStrings(input)
         this.input = input;
         this.payload = payload;
         this.toFiniteStrings = toFiniteStrings;
@@ -74,8 +73,11 @@ public final class CompletionTokenStream extends TokenStream {
          * produced. Multi Fields have the same surface form and therefore sum up
          */
         posInc = 0;
-        Util.toBytesRef(finiteStrings.next(), scratch); // now we have UTF-8
-        bytesAtt.setBytesRef(scratch);
+        Util.toBytesRef(finiteStrings.next(), bytesAtt.getBytesRef()); // now we have UTF-8
+        if (charTermAttribute != null) {
+            charTermAttribute.setLength(0);
+            charTermAttribute.append(bytesAtt.toUTF16());
+        }
         if (payload != null) {
             payloadAttr.setPayload(this.payload);
         }
@@ -107,16 +109,23 @@ public final class CompletionTokenStream extends TokenStream {
     @Override
     public void reset() throws IOException {
         super.reset();
+        if (hasAttribute(CharTermAttribute.class)) {
+            // we only create this if we really need it, to save the UTF-8 to UTF-16 conversion
+            charTermAttribute = getAttribute(CharTermAttribute.class);
+        }
         finiteStrings = null;
         posInc = -1;
     }

     public interface ByteTermAttribute extends TermToBytesRefAttribute {
-        public void setBytesRef(BytesRef bytes);
+        // marker interface
+        public CharSequence toUTF16();
     }

     public static final class ByteTermAttributeImpl extends AttributeImpl implements ByteTermAttribute, TermToBytesRefAttribute {
-        private BytesRef bytes;
+        private final BytesRef bytes = new BytesRef();
+        private CharsRef charsRef;

         @Override
         public int fillBytesRef() {
@@ -128,19 +137,24 @@ public final class CompletionTokenStream extends TokenStream {
             return bytes;
         }

         @Override
-        public void setBytesRef(BytesRef bytes) {
-            this.bytes = bytes;
-        }
-
-        @Override
         public void clear() {
             bytes.length = 0;
         }

         @Override
         public void copyTo(AttributeImpl target) {
             ByteTermAttributeImpl other = (ByteTermAttributeImpl) target;
-            other.bytes = bytes;
+            other.bytes.copyBytes(bytes);
         }

+        @Override
+        public CharSequence toUTF16() {
+            if (charsRef == null) {
+                charsRef = new CharsRef();
+            }
+            UnicodeUtil.UTF8toUTF16(bytes, charsRef);
+            return charsRef;
+        }
     }
 }
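
A side note on the copyTo change above (a hedged illustration, not part of the commit): assigning the BytesRef reference would alias one backing buffer between two attribute instances, whereas copyBytes gives the target its own copy, so clearing the source no longer clears the "copy".

```java
import org.apache.lucene.util.BytesRef;

// Illustrative only: aliasing a BytesRef vs. copying its bytes.
public class CopyToSketch {
    public static void main(String[] args) {
        BytesRef source = new BytesRef("foo");

        BytesRef aliased = source;      // old behaviour: both names share one buffer
        BytesRef copied = new BytesRef();
        copied.copyBytes(source);       // new behaviour: independent backing array

        source.length = 0;              // clear the source, as clear() does
        System.out.println(aliased.utf8ToString()); // "" - the alias sees the clear
        System.out.println(copied.utf8ToString());  // "foo" - the copy is unaffected
    }
}
```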

CompletionSuggestSearchTests.java

@@ -28,14 +28,17 @@ import org.elasticsearch.action.admin.indices.segments.IndexShardSegments;
 import org.elasticsearch.action.admin.indices.segments.ShardSegments;
 import org.elasticsearch.action.admin.indices.stats.IndicesStatsResponse;
 import org.elasticsearch.action.index.IndexRequestBuilder;
+import org.elasticsearch.action.percolate.PercolateResponse;
 import org.elasticsearch.action.search.SearchPhaseExecutionException;
 import org.elasticsearch.action.suggest.SuggestResponse;
+import org.elasticsearch.client.Requests;
 import org.elasticsearch.common.settings.ImmutableSettings;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.index.mapper.MapperException;
 import org.elasticsearch.index.mapper.MapperParsingException;
 import org.elasticsearch.index.mapper.core.CompletionFieldMapper;
+import org.elasticsearch.percolator.PercolatorService;
 import org.elasticsearch.search.sort.FieldSortBuilder;
 import org.elasticsearch.search.suggest.completion.CompletionStats;
 import org.elasticsearch.search.suggest.completion.CompletionSuggestion;
@@ -56,6 +59,7 @@ import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF
 import static org.elasticsearch.cluster.metadata.IndexMetaData.SETTING_NUMBER_OF_SHARDS;
 import static org.elasticsearch.common.settings.ImmutableSettings.settingsBuilder;
 import static org.elasticsearch.common.xcontent.XContentFactory.jsonBuilder;
+import static org.elasticsearch.index.query.QueryBuilders.matchAllQuery;
 import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
 import static org.hamcrest.Matchers.*;
@@ -89,6 +93,36 @@ public class CompletionSuggestSearchTests extends ElasticsearchIntegrationTest {
         assertSuggestionsNotInOrder("t", "The Prodigy", "Turbonegro", "Turbonegro Get it on", "The Prodigy Firestarter");
     }

+    @Test
+    public void testSuggestFieldWithPercolateApi() throws Exception {
+        createIndexAndMapping();
+        String[][] input = {{"Foo Fighters"}, {"Foo Fighters"}, {"Foo Fighters"}, {"Foo Fighters"},
+                {"Generator", "Foo Fighters Generator"}, {"Learn to Fly", "Foo Fighters Learn to Fly"},
+                {"The Prodigy"}, {"The Prodigy"}, {"The Prodigy"}, {"Firestarter", "The Prodigy Firestarter"},
+                {"Turbonegro"}, {"Turbonegro"}, {"Get it on", "Turbonegro Get it on"}}; // work with frequencies
+        for (int i = 0; i < input.length; i++) {
+            client().prepareIndex(INDEX, TYPE, "" + i)
+                    .setSource(jsonBuilder()
+                            .startObject().startObject(FIELD)
+                            .startArray("input").value(input[i]).endArray()
+                            .endObject()
+                            .endObject()
+                    )
+                    .execute().actionGet();
+        }
+
+        client().prepareIndex(INDEX, PercolatorService.TYPE_NAME, "4")
+                .setSource(jsonBuilder().startObject().field("query", matchAllQuery()).endObject())
+                .execute().actionGet();
+
+        refresh();
+
+        PercolateResponse response = client().preparePercolate().setIndices(INDEX).setDocumentType(TYPE)
+                .setGetRequest(Requests.getRequest(INDEX).type(TYPE).id("1"))
+                .execute().actionGet();
+        assertThat(response.getCount(), equalTo(1L));
+    }
+
     @Test
     public void testBasicPrefixSuggestion() throws Exception {
         createIndexAndMapping();
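
For reference, the consumption contract that both the percolator path and the tests below rely on is Lucene 4.x's TermToBytesRefAttribute: fetch the shared BytesRef once, then refresh it per token with fillBytesRef(). A hedged sketch (ByteTermConsumer and printTerms are placeholder names, not commit code):

```java
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.util.BytesRef;

// Hedged sketch of the Lucene 4.x byte-term consumption loop; works for any
// TokenStream, e.g. a CompletionTokenStream.
final class ByteTermConsumer {
    static void printTerms(TokenStream stream) throws IOException {
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        BytesRef bytes = termAtt.getBytesRef(); // shared instance, reused for every token
        stream.reset();
        while (stream.incrementToken()) {
            termAtt.fillBytesRef();             // refresh the shared BytesRef for this token
            System.out.println(bytes.utf8ToString());
        }
        stream.end();
        stream.close();
    }
}
```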

CompletionTokenStreamTest.java

@@ -25,10 +25,7 @@ import org.apache.lucene.analysis.core.SimpleAnalyzer;
 import org.apache.lucene.analysis.synonym.SynonymFilter;
 import org.apache.lucene.analysis.synonym.SynonymMap;
 import org.apache.lucene.analysis.synonym.SynonymMap.Builder;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.analysis.tokenattributes.*;
 import org.apache.lucene.search.suggest.analyzing.XAnalyzingSuggester;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CharsRef;
@@ -42,6 +39,8 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.util.Set;

 import static org.hamcrest.Matchers.equalTo;

 public class CompletionTokenStreamTest extends ElasticsearchTokenStreamTestCase {

     final XAnalyzingSuggester suggester = new XAnalyzingSuggester(new SimpleAnalyzer(TEST_VERSION_CURRENT));
@@ -143,12 +142,36 @@ public class CompletionTokenStreamTest extends ElasticsearchTokenStreamTestCase
         suggestTokenStream.close();
     }

+    @Test
+    public void testSuggestTokenFilterProperlyDelegateInputStream() throws Exception {
+        TokenStream tokenStream = new MockTokenizer(new StringReader("mykeyword"), MockTokenizer.WHITESPACE, true);
+        BytesRef payload = new BytesRef("Surface keyword|friggin payload|10");
+        TokenStream suggestTokenStream = new ByteTermAttrToCharTermAttrFilter(new CompletionTokenStream(tokenStream, payload, new CompletionTokenStream.ToFiniteStrings() {
+            @Override
+            public Set<IntsRef> toFiniteStrings(TokenStream stream) throws IOException {
+                return suggester.toFiniteStrings(suggester.getTokenStreamToAutomaton(), stream);
+            }
+        }));
+        TermToBytesRefAttribute termAtt = suggestTokenStream.getAttribute(TermToBytesRefAttribute.class);
+        BytesRef ref = termAtt.getBytesRef();
+        assertNotNull(ref);
+        suggestTokenStream.reset();
+
+        while (suggestTokenStream.incrementToken()) {
+            termAtt.fillBytesRef();
+            assertThat(ref.utf8ToString(), equalTo("mykeyword"));
+        }
+        suggestTokenStream.end();
+        suggestTokenStream.close();
+    }
+
     public final static class ByteTermAttrToCharTermAttrFilter extends TokenFilter {
-        private CharTermAttribute attr = addAttribute(CharTermAttribute.class);
         private ByteTermAttribute byteAttr = addAttribute(ByteTermAttribute.class);
         private PayloadAttribute payload = addAttribute(PayloadAttribute.class);
         private TypeAttribute type = addAttribute(TypeAttribute.class);
+        private CharTermAttribute charTermAttribute = addAttribute(CharTermAttribute.class);

         protected ByteTermAttrToCharTermAttrFilter(TokenStream input) {
             super(input);
         }
@@ -157,13 +180,12 @@ public class CompletionTokenStreamTest extends ElasticsearchTokenStreamTestCase
         public boolean incrementToken() throws IOException {
             if (input.incrementToken()) {
                 BytesRef bytesRef = byteAttr.getBytesRef();
-                attr.append(bytesRef.utf8ToString());
                 // we move them over so we can assert them more easily in the tests
                 type.setType(payload.getPayload().utf8ToString());
                 return true;
             }
             return false;
         }
     }
 }
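
To tie the three files together: the lazy fill only activates when a consumer has attached a CharTermAttribute before reset(), which is exactly what ByteTermAttrToCharTermAttrFilter does above. A hedged sketch of that opt-in (CharTermOptIn and printChars are hypothetical names):

```java
import java.io.IOException;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hedged sketch: opting in to UTF-16 terms. Without the addAttribute call the
// stream never pays for the conversion; terms stay as the UTF-8 BytesRef.
final class CharTermOptIn {
    static void printChars(TokenStream stream) throws IOException {
        // Attaching the attribute before reset() is what lets reset() wire up
        // the lazy CharTermAttribute fill in CompletionTokenStream.
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term.toString()); // filled lazily from the UTF-8 bytes
        }
        stream.end();
        stream.close();
    }
}
```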